In [1]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at the path below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from statsmodels.graphics.tsaplots import *

  import pandas.util.testing as tm


### Importing the master database and removing the unnecessary columns

In [16]:
# Location of the master database CSV on the mounted Google Drive.
MASTER_DB_PATH = "/content/drive/My Drive/CSV Files/masterDB.csv"

# Load the master database into the working frame `df_md`.
df_md = pd.read_csv(MASTER_DB_PATH)

In [17]:
# Columns that are not used in the rest of this notebook.
# NOTE(review): 'index' looks like a leftover row counter from an earlier
# export of the CSV — confirm before relying on that.
UNUSED_COLUMNS = ['RA', 'SN', 'FG', 'TS', 'VV', 'index']

# Delete them from df_md one by one, in place; a column that is not present
# raises KeyError at that point.
for column_name in UNUSED_COLUMNS:
    del df_md[column_name]

### Dropping rows where the sealevel column holds the missing-value sentinel (-99999)

In [18]:
# Value that this notebook treats as "no sea-level measurement".
SEALEVEL_MISSING = -99999

# Flag the rows carrying the sentinel, then remove them from df_md in place
# by their index labels.
is_missing = df_md['sealevel'].eq(SEALEVEL_MISSING)
df_md.drop(index=df_md.loc[is_missing].index, inplace=True)

### Checking the initial correlation

In [19]:
# Pairwise correlation matrix of the current columns, displayed to 3 decimals.
df_md.corr().round(decimals=3)

Unnamed: 0,Year,sealevel,avgT,maxT,minT,slp,avgRH,PP,avgW,VM,CO2_percapita
Year,1.0,0.91,0.382,0.324,0.416,0.026,-0.279,0.19,-0.604,-0.667,0.966
sealevel,0.91,1.0,0.194,0.194,0.197,0.321,-0.446,0.016,-0.556,-0.631,0.867
avgT,0.382,0.194,1.0,0.889,0.949,-0.573,0.117,0.169,-0.011,-0.124,0.383
maxT,0.324,0.194,0.889,1.0,0.748,-0.359,-0.09,0.002,0.05,-0.06,0.327
minT,0.416,0.197,0.949,0.748,1.0,-0.667,0.298,0.3,-0.069,-0.156,0.407
slp,0.026,0.321,-0.573,-0.359,-0.667,1.0,-0.568,-0.458,-0.151,-0.151,0.018
avgRH,-0.279,-0.446,0.117,-0.09,0.298,-0.568,1.0,0.583,0.079,0.217,-0.24
PP,0.19,0.016,0.169,0.002,0.3,-0.458,0.583,1.0,-0.207,-0.123,0.184
avgW,-0.604,-0.556,-0.011,0.05,-0.069,-0.151,0.079,-0.207,1.0,0.92,-0.584
VM,-0.667,-0.631,-0.124,-0.06,-0.156,-0.151,0.217,-0.123,0.92,1.0,-0.606


### Standardizing the features

In [5]:
# Raw column -> name of its standardized counterpart. The order here is the
# order in which the new columns are appended to df_md.
STANDARDIZED_NAMES = {
    'sealevel': 'Std_sealevel',
    'avgT': 'Std_avgT',
    'slp': 'Std_slp',
    'avgRH': 'Std_avgRH',
    'PP': 'Std_PP',
    'avgW': 'Std_avgW',
    'CO2_percapita': 'Std_CO2pc',
}

# z-score each column: subtract its mean and divide by its standard deviation
# (pandas' default Series.std(), i.e. the sample standard deviation, ddof=1).
for raw_name, std_name in STANDARDIZED_NAMES.items():
    raw_values = df_md[raw_name]
    df_md[std_name] = (raw_values - raw_values.mean()) / raw_values.std()

### Dropping the raw (unstandardized) columns and the unused features (maxT, minT, VM)

In [7]:
# Raw columns that have a Std_* counterpart, plus maxT, minT and VM, which
# were not standardized and are not carried forward.
RAW_COLUMNS = [
    'sealevel', 'avgT', 'maxT', 'minT', 'slp',
    'avgRH', 'PP', 'avgW', 'VM', 'CO2_percapita',
]

# Remove them from df_md in place.
df_md.drop(columns=RAW_COLUMNS, inplace=True)

### Checking the correlation

In [8]:
# Correlation matrix of the standardized frame, stored in `corr` at full precision.
corr = df_md.corr()

# Display a copy rounded to 3 decimals; `corr` itself is left unrounded.
corr.round(decimals=3)

Unnamed: 0,Year,Std_sealevel,Std_avgT,Std_slp,Std_avgRH,Std_PP,Std_avgW,Std_CO2pc
Year,1.0,0.91,0.382,0.026,-0.279,0.19,-0.604,0.966
Std_sealevel,0.91,1.0,0.194,0.321,-0.446,0.016,-0.556,0.867
Std_avgT,0.382,0.194,1.0,-0.573,0.117,0.169,-0.011,0.383
Std_slp,0.026,0.321,-0.573,1.0,-0.568,-0.458,-0.151,0.018
Std_avgRH,-0.279,-0.446,0.117,-0.568,1.0,0.583,0.079,-0.24
Std_PP,0.19,0.016,0.169,-0.458,0.583,1.0,-0.207,0.184
Std_avgW,-0.604,-0.556,-0.011,-0.151,0.079,-0.207,1.0,-0.584
Std_CO2pc,0.966,0.867,0.383,0.018,-0.24,0.184,-0.584,1.0


### Saving the standardized master database to a CSV file

In [9]:
# Write the standardized frame to Std_mdb.csv (path relative to the current
# working directory). The row index is written as well, which is pandas'
# default, so it appears as the first, unnamed column of the file.
df_md.to_csv("Std_mdb.csv")