# Project 1 - Aurèle Bohbot, Quentin Guilhot, Yanis Tournier

In [135]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as skl
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, ShuffleSplit
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV
from sklearn.metrics import r2_score, mean_squared_error
from collections import Counter

In [136]:
X = pd.read_csv('data/X_train.csv').drop(columns=['id'])
Xt = pd.read_csv('data/X_test.csv')
y = pd.read_csv('data/y_train.csv')
y = y.drop(columns=['id'])

In [137]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [138]:
# number of samples 
n_samples = X_train.shape[0]
n_features = X_train.shape[1]

# number of maximum features
n_max_features = int(np.sqrt(n_samples))

In [139]:
n_max_features

29

### Experimental section

#### Normalization of the data

In [140]:
# normalizing X_train

X_train_norm = X_train.copy()
for col in X_train.columns:
    X_train_norm[col] = (X_train[col] - X_train[col].mean()) / X_train[col].std()

# normalizing X_test
X_test_norm = X_test.copy()
for col in X_test.columns:
    X_test_norm[col] = (X_test[col] - X_test[col].mean()) / X_test[col].std()


In [141]:
# cleaning of X_train

X_train_norm=X_train_norm.fillna(X_train_norm.median())

# cleaning of X_test

X_test_norm=X_test_norm.fillna(X_test_norm.median())

# removing columns for which there are only nans

columns_to_remove = []
for column in X_test_norm.columns:
    if X_test_norm[column].isna().sum() > 0 or X_train_norm[column].isna().sum():
        columns_to_remove.append(column)

X_train_norm.drop(columns_to_remove, inplace=True, axis=1)
X_test_norm.drop(columns_to_remove, inplace=True, axis=1)

First regression without any additional treatment


In [142]:
model=LinearRegression()
model.fit(X_train_norm,y_train)

y_pred=model.predict(X_test_norm)
r2_score(y_pred, y_test)

-0.019188489565065137

### Feature selection

In [143]:
from sklearn.decomposition import PCA

pca = PCA(n_components=n_max_features)
X_train_norm = pca.fit_transform(X_train_norm)
X_test_norm = pca.transform(X_test_norm)
explained_variance = pca.explained_variance_ratio_

In [144]:
print(np.sum(explained_variance))

0.27286401474106725


In [145]:
X_train_norm.shape

(848, 29)

In [146]:
model=LinearRegression()
model.fit(X_train_norm,y_train)

y_pred=model.predict(X_test_norm)
r2_score(y_pred, y_test)

-1.094267018234106

### Handling outliers

In [147]:
from sklearn.covariance import EllipticEnvelope
pred = EllipticEnvelope(random_state=0).fit_predict(X_train_norm)
outlier_index = np.where(pred==-1)
outlier_values = X_train_norm[outlier_index]
inlier_index = np.where(pred==1)
X_train_norm_inliers = X_train_norm[inlier_index]


In [148]:
X_train_norm_inliers.shape

(763, 29)

In [149]:
inlier_index = np.where(pred==1)
X_train_norm_inliers = X_train_norm[inlier_index]
y_train_inliers = y_train[y_train.index.isin([y_train.index[i] for i in range(len(y_train.index)) if i in list(inlier_index[0])])]

In [150]:
model = LinearRegression()
model.fit(X_train_norm_inliers, y_train_inliers)
y_pred = model.predict(X_test_norm)
y_pred.shape
score = r2_score(y_test, y_pred)
score

0.33084901266568156

In [151]:
y_train = y_train[y_train.index]

KeyError: "None of [Int64Index([ 874,  670,  931,  482,  927,  947,  548,  354, 1060,  423,\n            ...\n            1123,   87,  330,  466,  121, 1044, 1095, 1130,  860, 1126],\n           dtype='int64', length=848)] are in the [columns]"

In [None]:
def detection(df,features):
    outlier_indices=[]
    
    for c in features:
        Q1, Q3 = np.percentile(df[c],25), np.percentile(df[c],75)
        
        #IQR calculation
        IQR = Q3 - Q1
        outlier_step = IQR * 1.5
        lower_range = Q1 - (outlier_step)
        upper_range = Q3 + (outlier_step)
        
        #Outlier detection                                    #Outlier indexes
        outlier_list_col=df[  (df[c] < lower_range) | (df[c] > upper_range)  ].index
       
        #Store indexes
        outlier_indices.extend(outlier_list_col)
        
    outlier_indices=Counter(outlier_indices)
    # number of outliers
    # If we have more then 2 outliers in a sample, this sample ll be drop
    multiple_outliers = list(i for i, v in outlier_indices.items() if v > 2 )
    #we are taking indexes
    
    return multiple_outliers

### Handling missing values 

For the moment, we replace the missing values by the column's median. Depending on the results, we might implement a knn algorithm.

In [None]:
X_filled = X_train.fillna(X_train.median())

In [None]:
X_train_norm = X_filled.copy()
for col in X_filled.columns:
    X_train_norm[col] = (X_filled[col] - X_filled[col].mean()) / X_filled[col].var()
X_train_norm

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x822,x823,x824,x825,x826,x827,x828,x829,x830,x831
874,-0.000191,-0.000015,0.001862,0.000215,-0.154380,-0.009335,0.002314,-0.002417,0.003107,0.777743,...,0.001009,0.000014,0.006812,0.384590,-0.140845,0.000306,3.768914,-0.006653,-0.021844,0.001487
670,-0.000021,-0.000237,-0.000148,-0.000141,0.505886,0.092102,0.000029,-0.000460,-0.004630,0.433480,...,-0.001007,-0.000066,0.005971,-0.003462,0.399401,0.001831,0.524528,-0.000215,-0.002161,0.000899
931,-0.000013,-0.000724,-0.002035,-0.000286,0.137264,-0.073407,0.000816,0.004557,0.005539,1.515610,...,-0.000212,0.000161,0.000315,-0.641024,0.277172,-0.000523,2.442379,-0.008166,-0.062831,0.000397
482,-0.000035,0.000053,-0.000260,-0.000093,-0.559518,0.052546,-0.001279,-0.002651,0.001732,-0.117031,...,0.001693,0.000398,0.003330,0.042626,0.240126,-0.002526,-0.540456,-0.006263,-0.014049,-0.000402
927,0.000057,0.000565,0.000552,0.000201,-0.650713,-0.166438,0.000268,0.004325,-0.000078,1.636243,...,-0.001827,0.000439,0.002366,0.556094,0.348091,0.001144,-3.817613,-0.005040,-0.051506,-0.000167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1044,-0.000017,0.000201,-0.000747,-0.000146,-0.299233,-0.074816,0.001337,0.000294,0.002046,-0.035358,...,0.000124,0.000096,-0.000556,0.597515,-0.241668,0.001819,-0.548766,-0.000014,0.010332,0.001732
1095,-0.000130,-0.000578,-0.001493,0.000049,-0.291664,-0.044665,-0.002494,0.004691,-0.004626,-0.531617,...,0.000950,0.000117,-0.001333,-0.554086,0.021501,0.001412,5.425245,-0.005304,-0.047635,-0.001013
1130,-0.000310,-0.000022,-0.001574,-0.000002,0.501948,0.019427,-0.000673,0.000029,0.005545,0.605537,...,-0.000169,0.000546,-0.000556,-0.217587,0.430237,0.000164,1.563517,0.002265,0.041897,0.000199
860,0.000509,0.001001,0.003753,0.000035,0.597933,-0.023415,0.001096,-0.000490,-0.001070,-0.840170,...,-0.002403,0.000560,0.007744,-0.231753,0.597580,0.003269,1.454245,0.003957,-0.007954,-0.002471


In [None]:
d = {}
for col in X_train_norm.columns:
    Q1, Q3 = np.percentile(X_train_norm[col],25), np.percentile(X_train_norm[col],75)
    IQR = Q3 - Q1
    outlier_step = IQR * 2
    lower_range = Q1 - (outlier_step)
    upper_range = Q3 + (outlier_step)
    d[col] = (lower_range, upper_range)
for i, row in X_train_norm.iterrows():
    for col in X_train_norm.columns:
        if row[col] < d[col][0] or row[col] > d[col][1]:
            X_train_norm.loc[i,col]=0
X_train_norm

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x822,x823,x824,x825,x826,x827,x828,x829,x830,x831
874,-0.000191,-0.000015,0.001862,0.000215,-0.154380,-0.009335,0.002314,-0.002417,0.003107,0.777743,...,0.001009,0.000014,0.006812,0.384590,-0.140845,0.000306,3.768914,-0.006653,-0.021844,0.001487
670,-0.000021,-0.000237,-0.000148,-0.000141,0.505886,0.092102,0.000029,-0.000460,-0.004630,0.433480,...,-0.001007,-0.000066,0.005971,-0.003462,0.399401,0.001831,0.524528,-0.000215,-0.002161,0.000899
931,-0.000013,-0.000724,-0.002035,-0.000286,0.137264,-0.073407,0.000816,0.004557,0.005539,1.515610,...,-0.000212,0.000161,0.000315,-0.641024,0.277172,-0.000523,2.442379,-0.008166,-0.062831,0.000397
482,-0.000035,0.000053,-0.000260,-0.000093,-0.559518,0.052546,-0.001279,-0.002651,0.001732,-0.117031,...,0.001693,0.000398,0.003330,0.042626,0.240126,-0.002526,-0.540456,-0.006263,-0.014049,-0.000402
927,0.000057,0.000565,0.000552,0.000201,-0.650713,-0.166438,0.000268,0.004325,-0.000078,1.636243,...,-0.001827,0.000439,0.002366,0.556094,0.348091,0.001144,-3.817613,-0.005040,-0.051506,-0.000167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1044,-0.000017,0.000201,-0.000747,-0.000146,-0.299233,-0.074816,0.001337,0.000294,0.002046,-0.035358,...,0.000124,0.000096,-0.000556,0.597515,-0.241668,0.001819,-0.548766,-0.000014,0.010332,0.001732
1095,-0.000130,-0.000578,-0.001493,0.000049,-0.291664,-0.044665,-0.002494,0.004691,-0.004626,-0.531617,...,0.000950,0.000117,-0.001333,-0.554086,0.021501,0.001412,5.425245,-0.005304,-0.047635,-0.001013
1130,-0.000310,-0.000022,-0.001574,-0.000002,0.501948,0.019427,-0.000673,0.000029,0.005545,0.605537,...,-0.000169,0.000546,-0.000556,-0.217587,0.430237,0.000164,1.563517,0.002265,0.041897,0.000199
860,0.000509,0.001001,0.003753,0.000035,0.597933,-0.023415,0.001096,-0.000490,-0.001070,-0.840170,...,-0.002403,0.000560,0.007744,-0.231753,0.597580,0.003269,1.454245,0.003957,-0.007954,-0.002471


### Handling feature selections

In [None]:
lasso = Lasso()
lasso.fit(X_train_norm, y_train)

ValueError: Input X contains NaN.
Lasso does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
len([coef for coef in lasso.coef_ if coef!=0])

In [None]:
y_pred=lasso.predict(X_test)

In [None]:
r2_score(y_pred, y_test)

In [None]:
pca=PCA()
pca.fit(X_train_norm, y_train)

df = pd.DataFrame({"pca":pca.get_feature_names_out(),"sv":pca.singular_values_})
fig = px.bar(df, x='pca', y='sv')
fig.show()

In [None]:
X_pca = pca.transform(X_train_norm)
model = LinearRegression()
model.fit(X_pca, y_train)

In [None]:
y_pred = model.predict(pca.transform(X_test))
score = r2_score(y_test, y_pred)
score

In [None]:
plt.figure(figsize=(8,8))
plt.bar(pca.get_feature_names_out(),pca.singular_values_)
plt.show()