In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

In [2]:
path = os.path.join(os.path.abspath(os.pardir),"paribas_data.csv")
paribas_data = pd.read_csv(path, nrows=10000)
paribas_data.shape

(10000, 133)

In [3]:
# removing the non-numeric columns from the dataset
num_colums = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_columns = list(paribas_data.select_dtypes(include=num_colums).columns)
paribas_data = paribas_data[numerical_columns]
paribas_data.shape

(10000, 114)

In [4]:
X_train,X_test,y_train,y_test = train_test_split(
    paribas_data.drop(labels=['target', 'ID'], axis=1),
    paribas_data['target'],
    test_size=0.3,
    random_state=41)

In [10]:
# Creating a set of all the columns with a correlation of greater than 0.8
correlated_features = set()
correlation_matrix = paribas_data.corr()
for i in range(len(correlation_matrix .columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

In [11]:
# Removing the columns as per above set
X_train.drop(labels=correlated_features, axis=1, inplace=True)
X_test.drop(labels=correlated_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((7000, 57), (3000, 57))

In [17]:
# replacing nan values with zero
X_train.replace(np.nan, 0, inplace=True)
X_test.replace(np.nan, 0, inplace=True)

In [22]:
from sklearn.linear_model import LinearRegression
lin = LinearRegression()
lin.fit(X_train,y_train)
lin.score(X_test,y_test)

0.06953017251728322

In [23]:
# score_func of Annova
from sklearn.feature_selection import f_regression
# class that accepts the score_func parameters
from sklearn.feature_selection import SelectKBest
annova = SelectKBest(score_func=f_regression,k=10)
X_train_annova = annova.fit_transform(X_train,y_train)
X_test_annova = annova.transform(X_test)
lin_annova = LinearRegression()
lin_annova.fit(X_train_annova, y_train)
lin_annova.score(X_test_annova, y_test)

0.07326154993912137