# Worksheet for Correlation Analysis

In [1]:
import os.path
try:
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
import dcor

### Download data and load into a dataframe

In [2]:
#### LOAD THE HELPER FUNCTIONS BELOW####
#### IMPORTANT: DO NOT change these functions or your final submission will not evaluate correctly ###

## This downloads your datafile, Do not change this function
def downloadFile(dataSetId):
    """Download ``<dataSetId>.csv`` from the qq10-data S3 bucket into the working directory.

    Args:
        dataSetId: Base name of the data set. ``.csv`` is appended to form
            both the remote object name and the local file name.

    Returns:
        True when the server answered with status 200 and the file was
        written, False for any other status code.
    """
    fileName = '%s.csv' % (dataSetId)
    url = 'https://s3.us-east-2.amazonaws.com/qq10-data/' + fileName
    print(url)

    response = urlopen(url)
    # try/finally instead of `with`: the urllib2 fallback import returns a
    # response object that is not a context manager.
    try:
        status = response.getcode()
        if status != 200:
            # Reported with print: no logError helper is defined in the
            # visible notebook cells, so calling it would raise NameError.
            print('File not found. Please ensure you are working with correct data set Id')
            return False

        print('Downloading the dataset %s' % (fileName))
        # Store the payload byte-for-byte; a decode + text-mode write would
        # depend on the platform's default encoding and newline translation.
        with open(fileName, 'wb') as f:
            f.write(response.read())
        return True
    finally:
        response.close()


In [None]:
# Fetch the feature table only when no local copy exists, then load it and
# show its column names.
filename = 'feature_data'
local_csv = '{}.csv'.format(filename)
if not os.path.isfile(local_csv):
    downloadFile(filename)
df_train = pd.read_csv(local_csv)
df_train.columns

In [None]:
# Fetch the target table only when no local copy exists, then load it and
# show its column names.
filename = 'target_variable_data'
local_csv = '{}.csv'.format(filename)
if not os.path.isfile(local_csv):
    downloadFile(filename)
y_train = pd.read_csv(local_csv)
y_train.columns

#### View descriptive statistics of each dataset

In [None]:
# Summary statistics of the feature table, transposed so that each feature
# occupies one row.
df_train.describe().transpose()

In [None]:
# Summary statistics of the target table, transposed so that each target
# variable occupies one row.
y_train.describe().transpose()

#### Explore a single target variable

In [None]:
# Histogram of target variable A1.
y_train['A1'].plot.hist()

In [None]:
# Distribution plot of target A1 with a normal distribution fitted to the data
# for visual comparison (fit=norm).
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot if the environment's seaborn is upgraded - confirm version.
sns.distplot(y_train['A1'], fit = norm)

In [None]:
# Shape of the A1 distribution, displayed as a (skewness, kurtosis) tuple.
(y_train['A1'].skew(), y_train['A1'].kurtosis())

#### Try transformations on the target variable

In [None]:
# Transform A1 with log1p, i.e. log(1 + x), and plot the transformed values
# against a fitted normal distribution to see whether the transform makes the
# target look more Gaussian.
log_a1 = np.log1p(y_train['A1'])
# NOTE(review): distplot is deprecated in recent seaborn releases - confirm
# the seaborn version this notebook is expected to run on.
sns.distplot(log_a1, fit = norm)

#### Other descriptive plots

In [None]:
# One 50-bin histogram per column of the target table; the trailing semicolon
# suppresses the textual repr of the returned axes.
y_train.hist(figsize=(30, 20), bins=50);

#### Explore relationships with explanatory variables

In [None]:
# Scatter a single explanatory variable against target A1.
var = 'Alpha_A1_1'
data = pd.concat([y_train['A1'], df_train[var]], axis='columns')
data.plot(kind='scatter', x=var, y='A1')

In [None]:
# Box plot of target A1 grouped by the values of one Beta feature.
var = 'Beta_A_1'
data = pd.concat([y_train['A1'], df_train[var]], axis='columns')
f, ax = plt.subplots(figsize=(16, 8))
# Draw on the axes created above (they are also the current axes).
fig = sns.boxplot(data=data, x=var, y='A1', ax=ax)
# Rotate the x tick labels so they stay readable.
plt.xticks(rotation=90);

In [None]:
# Pairwise scatter matrix of every feature whose name contains '_A1'.
sns.set()
# NOTE(review): `cols` is not used by the pairplot call below, which selects
# its columns through the regex filter instead - confirm which was intended.
cols = ['Alpha_A1_1', 'Alpha_A1_2', 'Alpha_A1_3', 'Alpha_A1_4', 'Alpha_A1_5', 'Alpha_A1_6', 'Alpha_A1_7', 'Alpha_A1_8', 'Alpha_A1_9', 'Alpha_A1_10']
# The facet size is left at the library default of 2.5: the old `size=`
# keyword is deprecated (renamed to `height`), and omitting it behaves the
# same under both names.
sns.pairplot(df_train.filter(regex='_A1'))
plt.show();

#### Explore intercorrelations

In [None]:
# Spearman rank-correlation matrix over the features whose name contains '_A1'.
corrmat = (
    df_train
    .filter(regex='_A1')
    .corr(method='spearman')
)

In [None]:
# Heatmap of the k features ranked highest by Spearman correlation with `var`.
var = 'Alpha_A1_1'
f, ax = plt.subplots(figsize=(12, 10))
k = 25 #number of variables for heatmap
cols = corrmat.nlargest(k, var)[var].index
# Pearson matrix of the selected columns. pandas computes it pairwise on the
# rows where both columns are present, so one missing value no longer turns a
# whole row and column into NaN as np.corrcoef on the raw values would.
cm = df_train[cols].corr().values
sns.heatmap(cm, ax=ax, cmap="YlGnBu", linewidths=0.1, yticklabels=cols.values, xticklabels=cols.values)

In [None]:
# Hierarchically clustered view of the correlation matrix `cm`.
cg = sns.clustermap(cm, cmap="YlGnBu", linewidths=0.1);
# Keep the row labels horizontal.
plt.setp(cg.ax_heatmap.get_yticklabels(), rotation=0)
cg

#### Explore correlations with target variable

In [None]:
# Rank the '_A1' features by Spearman correlation with the target and show a
# heatmap of the top k (the target itself is part of the frame).
tv = 'A1'
df_tv = df_train.filter(regex='_A1').join(y_train[tv])
corrmat = df_tv.corr(method='spearman')
f, ax = plt.subplots(figsize=(12, 10))

k = 50 #number of variables to explore
cols = corrmat.nlargest(k, tv)[tv].index
# Pearson matrix of the selected columns. pandas computes it pairwise on the
# rows where both columns are present, so one missing value no longer turns a
# whole row and column into NaN as np.corrcoef on the raw values would.
cm = df_tv[cols].corr().values
sns.heatmap(cm, ax=ax, cmap="YlGnBu", linewidths=0.1, yticklabels=cols.values, xticklabels=cols.values)

In [None]:
# For every top-ranked feature, print its Pearson correlation table with the
# target and scatter the two against each other.
# Iterating over `cols` directly (rather than range(k)) avoids an IndexError
# when fewer than k columns were available to rank.
for col in cols:
    if col == tv:
        # The target against itself carries no information and would create
        # a frame with two identically named columns.
        continue
    temp_df = df_tv[[col, tv]]
    print(temp_df.corr(method='pearson'))
    plt.plot(temp_df[col], temp_df[tv], '.b')
    plt.show()

In [None]:
# Discard features that are weakly related to the target, then drop collinear
# ones. (This is a trial run for target variable A1.)
Var = 'A1'

# Candidate columns: the Alpha_ features of this target, the Beta_ features
# sharing the target's first letter, and the Beta_Z_ features.
prefixes = ('Alpha_' + Var, 'Beta_' + Var[0], 'Beta_Z_')
filter_col = [col for col in df_train if col.startswith(prefixes)]

# Step 1: keep the features whose distance correlation with the target
# exceeds 0.4.
features = []
for name in filter_col:
    # Only rows where both the feature and the target are present are used.
    result = pd.concat([df_train[name], y_train[Var]], axis=1).dropna()
    correlation = dcor.distance_correlation(result[Var], result[name])
    print(correlation, name)
    if correlation > 0.4:
        features.append(name)

data = pd.DataFrame(df_train, columns=features)
print(data)

# Step 2, method 1: walk the upper triangle of the correlation matrix and drop
# the later column of every pair whose absolute correlation is at least 0.95.
corr = data.corr()
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i + 1, corr.shape[0]):
        if abs(corr.iloc[i, j]) >= 0.95:
            columns[j] = False
selected_features = data.columns[columns]
print(selected_features)
print(len(selected_features))


# Method 2 of discarding correlated features: drop every column that has an
# absolute correlation above 0.90 with any other column (the diagonal is
# masked out so a column is not compared with itself).
corr = data.corr()
m = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.90).any()
print(m)
raw = corr.loc[m, m]
print(raw)
selected_columns = list(raw)
print(selected_columns)
print(len(selected_columns))





   



In [None]:

# Import the combined data set and split the target columns off from the features.
import pandas as pd
data = pd.read_csv("alldata.csv")
# Target columns: upper-case and lower-case variants of the same labels.
col = [
    'A1', 'A2', 'B1', 'B2', 'C1', 'C2', 'D1', 'D2', 'E1',
    'a1', 'a2', 'b1', 'b2', 'c1', 'c2', 'd1', 'd2', 'e1',
]
targets = data[col]
# Everything that is not a target column stays in `data` as a feature.
data = data.drop(col, axis=1)


In [None]:
import os.path
try:
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen
import numpy as np
import pandas as pd
import dcor

def getCorrelatedFeatures(df_train, target, min_dcor=0.2, max_collinearity=0.95):
    """Select features related to the target and drop collinear ones.

    Candidate columns are those of ``df_train`` whose name starts with
    ``Alpha_<T>``, ``Beta_<first letter of T>`` or ``Beta_Z_``, where ``<T>``
    is the capitalized name of the first column of ``target``.

    Args:
        df_train: DataFrame holding the feature columns.
        target: DataFrame whose first column is the target variable.
        min_dcor: A candidate is kept only when its distance correlation with
            the target is strictly greater than this value (default 0.2).
        max_collinearity: A kept feature is discarded when its absolute
            correlation with any earlier kept-list column reaches this value
            (default 0.95).

    Returns:
        pandas Index with the names of the selected feature columns.
    """
    target_name = list(target)[0]
    cap = target_name.capitalize()
    prefixes = ('Alpha_' + cap, 'Beta_' + cap[0], 'Beta_Z_')
    candidates = [col for col in df_train if col.startswith(prefixes)]

    # Step 1: discard features whose distance correlation with the target does
    # not exceed min_dcor.
    features = []
    for name in candidates:
        # Only rows where both the feature and the target are present are used.
        pair = pd.concat([df_train[name], target[target_name]], axis=1).dropna()
        correlation = dcor.distance_correlation(pair[target_name], pair[name])
        if correlation > min_dcor:
            features.append(name)

    template = pd.DataFrame(df_train, columns=features)

    # Step 2: discard collinear features (first submission). Walk the upper
    # triangle of the correlation matrix and drop the later column of every
    # pair at or above max_collinearity. A column is compared against all
    # earlier columns, including ones that were themselves dropped.
    corr = template.corr()
    keep = np.full((corr.shape[0],), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if abs(corr.iloc[i, j]) >= max_collinearity:
                keep[j] = False
    selected_features = template.columns[keep]
    return selected_features
 
# Alternative endings of getCorrelatedFeatures used for other submissions,
# kept for reference only (none of this is executed):
#   Second submission:
#     return(features)
#   Third submission:
#     data=pd.DataFrame(df_train, columns = features)
#     corr = data.corr()
#     m = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.90 ).any()
#     raw = corr.loc[m, m]
#     selected_columns= list(raw)
#     return(selected_columns)

# Select the features for target A1; the double brackets keep `target` a
# one-column DataFrame, which is what getCorrelatedFeatures reads.
target = targets[['A1']]
main_features = getCorrelatedFeatures(data, target)



In [None]:
import numpy
import keras
import tensorflow
import os
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, LSTM, GRU, Conv1D,LocallyConnected1D
from keras.layers.advanced_activations import PReLU
from keras.optimizers import Adam
from matplotlib import pyplot as pyplot
import numpy as np


# Design matrix restricted to the selected features, plus the target frame.
X = pd.DataFrame(data, columns=main_features)
Y = target
no_features = len(main_features)

# Positional split: the first 200 rows train the model, the last 40 test it.
# NOTE(review): the two slices overlap when the table has fewer than 240 rows
# - confirm the row count of the data set.
X_train = X.iloc[:200].values
Y_train = Y.iloc[:200].values
X_test = X.iloc[-40:].values
Y_test = Y.iloc[-40:].values

# Seed NumPy's global generator before the model is built.
numpy.random.seed(1)

# Fully connected regression network with a single linear output unit.
model = Sequential([
    Dense(150, bias_initializer='zeros', activation='relu', input_dim=no_features),
    Dense(80, bias_initializer='zeros', activation='relu'),
    Dense(40, bias_initializer='zeros', activation='relu'),
    Dense(15, bias_initializer='zeros', activation='tanh'),
    Dense(1, bias_initializer='zeros', activation='linear'),
])

# compile
optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999)
model.compile(optimizer=optimizer, loss='mean_squared_error')

# fit
model.fit(X_train, Y_train, batch_size=50, epochs=200)

# evaluate
score = model.evaluate(X_test, Y_test, batch_size=10)
predictions = model.predict(X_test, batch_size=10)
print("evaluation score:", score)
    


In [None]:
import sklearn
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the predictions on the test rows.
R = r2_score(Y_test, predictions)
print(R)