In [None]:
import os

# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Lets import the basic libraries first then we will proceed to the others as the need arises
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides any deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Load the competition's train and test CSV files from the Kaggle input directory.
train_original  = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/train.csv')
test_original = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/test.csv')

In [None]:
# Load the sample submission file and preview its first rows to see the expected output format.
sample = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/sample_submission.csv')
sample.head()

In [None]:
# Work on copies so the frames exactly as loaded stay available for later reference
# (for example, the `id` column of `test_original` is used when building the submission).
# The price is a second set of variable names to keep track of.
train, test = train_original.copy(), test_original.copy()

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data as well.
test.head()

# Checking the Basic data hygiene and preprocessing

In [None]:
# (number of rows, number of columns) of the training frame.
train.shape

300000 rows and 16 columns

In [None]:
# Count the missing values in each column of the training data.
train.isnull().sum()

No null values here... so one less thing to deal with

In [None]:
# Count the fully duplicated rows in the training data.
train.duplicated().sum()

Good no duplicates...

In [None]:
# Count the missing values in each column of the test data.
test.isnull().sum()

No missing values

In [None]:
# (number of rows, number of columns) of the test frame.
test.shape

200000 rows and 15 columns

In [None]:
# Count the fully duplicated rows in the test data.
test.duplicated().sum()

No duplicates

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train.info()

No categorical variables. Hence no encoding of variables will be required later

In [None]:
# Drop the `id` column so the row identifier is not used as a feature in the regressions.
# Reassigning the result (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell re-runnable: executing it a second time no longer raises a KeyError.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

In [None]:
# Summary statistics of the training columns, transposed so that each column becomes a row.
train.describe().T

# EDA 1.0

Let's explore the datasets. If any processing turns out to be required, we will do it and then perform the EDA again; hence this is EDA 1.0.

First, let's check the distribution of each variable and see how the train and test datasets compare.

In [None]:
# Overlay the train and test distribution of every feature, one panel per feature.
# The grid is created once and sized from the number of features, so the plot keeps
# working if the set of feature columns changes (14 features give the 7x2 layout).
feature_cols = train.drop(['target'], axis=1).columns
n_grid_cols = 2
n_grid_rows = -(-len(feature_cols) // n_grid_cols)  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(15, 30), squeeze=False)
panels = axes.ravel()

for ax, col in zip(panels, feature_cols):
    sns.distplot(train[col], color='blue', label='train', ax=ax)
    sns.distplot(test[col], color='red', label='test', ax=ax)
    ax.set_xlabel(col, fontsize=9)
    ax.legend()

# Hide any panel left over when the feature count does not fill the grid.
for unused_ax in panels[len(feature_cols):]:
    unused_ax.set_visible(False)

plt.show()

1. The train and test datasets have very similar distributions for every variable.
2. Most variables are not normally distributed (not that they are required to be).
3. There could be some outliers.

In [None]:
# Pairwise scatter plots to eyeball how the variables vary with one another
# (this helps to spot multicollinearity).
# Drawing every row for every pair of columns is very slow on the full frame, so a
# fixed-seed random sample of at most 10,000 rows is plotted instead.
pairplot_sample = train.sample(n=min(len(train), 10000), random_state=42)
sns.pairplot(pairplot_sample)

Hmmm... The independent variables do not show any specific correlation with each other. Also, the target variable does not seem to be correlated with any of the independent variables.

We will further check for correlation in the heatmap. But even if any of the independent variables show any correlation there, it will be difficult to conclude that there is actually a correlation between them. To me the dataset variables seem like randomly generated numbers. 

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of `train`.
corr_matrix = train.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')

The heatmap above does show quite a few correlated variables. But again, it is difficult to ascertain that the correlation is real, so we will keep these variables in the dataset. Otherwise I would have removed the correlated variables.

We should also note that none of the independent variables shows any correlation with the target. There seems to be too much noise in the dataset, so we should expect low R-squared values in the regression model.

In [None]:
# Box plot of every predictor side by side, to look for outliers.
predictors = train.drop(columns=['target'])
plt.figure(figsize=(20, 10))
sns.boxplot(data=predictors)
plt.title('The boxplot to study outliers')
plt.xlabel('Variables that predict the Target')
plt.ylabel('Values')

The scales of all the variables look similar. Hence there is no need to perform scaling.

No outlier treatment is required, since the train and test data follow practically the same distributions.

Variables such as cont2, cont4, cont5 and cont8 are skewed, and cont11, cont12 and cont14 have double peaks in the data.

Typically these might represent the different clusters. 


# Analysing the target variable

In [None]:
# Check whether the target looks roughly normally distributed, starting with a distribution plot.
# NOTE(review): the usual normality assumption in linear regression concerns the residuals,
# not the raw target — confirm this is the check that is wanted here.

sns.distplot(train['target'],color = 'blue', label='train')

The target variable is double-peaked, so we can't yet say whether it is normally distributed. If it is not, we will need to transform it so that it becomes normal.

In [None]:
from statsmodels.graphics.gofplots import qqplot
# Q-Q plot of the target's sample quantiles against theoretical quantiles, with a reference line (line='s').
qqplot(train['target'], line='s')
plt.show()

From qq plot, it seems that the target is normally distributed.

# Train Test Split

In [None]:
# Split the labelled data: 75% for fitting, the remaining 25% held out for validation.
from sklearn.model_selection import train_test_split

y = train['target']
X = train.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=42
)

`train_test_split` returns four objects:
1. the training features (without the target)
1. the test features (without the target)
1. the training target values
1. the test target values

In [None]:
# First rows of the training features.
X_train.head()

In [None]:
# First rows of the held-out features.
X_test.head()

In [None]:
# First values of the training target.
y_train.head()

In [None]:
# First values of the held-out target.
y_test.head()

# Building Linear Regression Model

In [None]:
# import linear regression library

from sklearn.linear_model import LinearRegression

In [None]:
# Create an unfitted scikit-learn LinearRegression estimator with default settings.

reg_model = LinearRegression()

In [None]:
# Fit the linear regression on the training split.
reg_model.fit(X_train,y_train)

In [None]:
# Print each feature name next to its fitted coefficient, in column order.
for col_name, coefficient in zip(X_train.columns, reg_model.coef_):
    print("The coefficient for", col_name, "is", coefficient)

In [None]:
# Read the fitted intercept of the scikit-learn model and print it.

intercept = reg_model.intercept_

print("The intercept for our model is", intercept)

In [None]:
# R^2 (coefficient of determination) of the fitted model on the training split.
reg_model.score(X_train, y_train)

Here we can see that the R square value is very low as expected. There is a lot of noise in the train as well as test datasets. Typically we would see some pattern in the data while plotting the correlation heatmap and the scatter plots.

For a good model the R^2 and adjusted R^2 values would be close to 1

In [None]:
# R^2 (coefficient of determination) of the same model on the held-out split.
reg_model.score(X_test, y_test)

Same case with the test data, the R square value is very low

# Linear Regression using Statsmodels

R^2 is not a reliable metric as it always increases with addition of more attributes even if the attributes have no influence on the predicted variable.

Instead we use adjusted R^2 which removes the statistical chance that improves R^2.

Scikit-learn does not provide a built-in adjusted R^2, so we use statsmodels, a library that gives results similar to what you obtain in the R language. Its formula interface expects X and y to be given in a single dataframe.

Further, we can improve the model by backward elimination.

In [None]:
# Put the training features and the training target side by side in one frame,
# since the formula-based fit below takes a single `data` frame.
data_train = pd.concat([X_train, y_train], axis=1)
data_train.head()

In [None]:
# Regression formula: `target` modelled on all fourteen features, cont1 through cont14.

reg_expression = 'target ~ cont1+cont2+cont3+cont4+cont5+cont6+cont7+cont8+cont9+cont10+cont11+cont12+cont13+cont14'

In [None]:
import statsmodels.formula.api as smf
# Fit an OLS model from the first formula on the combined training frame.
model1 = smf.ols(formula=reg_expression, data=data_train).fit()
# Show the first 5 fitted parameters.
model1.params.head()

In [None]:
# Print the full statsmodels regression summary for model1.
print(model1.summary())

In [None]:
# NOTE(review): `RegressionResults` is imported here but not referenced anywhere in the visible notebook.
from statsmodels.regression.linear_model import RegressionResults
# Square root of model1's residual mean squared error on the training data.
np.sqrt(model1.mse_resid)

In [None]:
# Adjusted R^2 of model1.
model1.rsquared_adj

In [None]:
# Second regression formula: the same as the first one, but without cont14.

reg_expression_2 = 'target ~ cont1+cont2+cont3+cont4+cont5+cont6+cont7+cont8+cont9+cont10+cont11+cont12+cont13'

In [None]:
# Fit a second OLS model from the reduced formula on the same training frame.
model2 = smf.ols(formula=reg_expression_2, data=data_train).fit()
# Show the first 5 fitted parameters.
model2.params.head()

In [None]:
# Print the full statsmodels regression summary for model2.
print(model2.summary())

In [None]:
# Square root of model2's residual mean squared error on the training data.
np.sqrt(model2.mse_resid)

In [None]:
# Adjusted R^2 of model2.
model2.rsquared_adj

In [None]:
# Another way of finding the RMSE: scikit-learn's mean squared error on each model's
# predictions, followed by a square root. Reported per model for both splits.
from sklearn.metrics import mean_squared_error

rmse_cases = [
    ("model1 train RMSE:", model1, X_train, y_train),
    ("model1 test RMSE:", model1, X_test, y_test),
    ("model2 train RMSE:", model2, X_train, y_train),
    ("model2 test RMSE:", model2, X_test, y_test),
]
for label, fitted, features, actual in rmse_cases:
    print(label, np.sqrt(mean_squared_error(actual, fitted.predict(features))))

# Ridge and Lasso

In [None]:
# Fit lasso and ridge regressions with scikit-learn's default settings on the training split.
from sklearn import linear_model

lasso = linear_model.Lasso()
lasso.fit(X_train, y_train)
ridge = linear_model.Ridge()
ridge.fit(X_train, y_train)

# Report the training R^2, the held-out RMSE and the fitted coefficients (lasso first each time).
lasso_r2 = lasso.score(X_train, y_train)
print("R^2_lasso score:", lasso_r2)
ridge_r2 = ridge.score(X_train, y_train)
print("R^2_ridge score:", ridge_r2)

lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso.predict(X_test)))
print("lasso RMSE:", lasso_rmse)
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge.predict(X_test)))
print("ridge RMSE:", ridge_rmse)

print("lasso coef:", lasso.coef_)
print("ridge coef:", ridge.coef_)

In [None]:
# Compare the held-out RMSE of all the models built so far.
test_rmse_cases = [
    ("model1 test RMSE:", model1),
    ("model2 test RMSE:", model2),
    ("lasso test RMSE:", lasso),
    ("ridge test RMSE:", ridge),
]
for label, fitted in test_rmse_cases:
    print(label, np.sqrt(mean_squared_error(y_test, fitted.predict(X_test))))

Out of all the models, model2 performs the best.

With lasso, all the coefficients are 0, so we will omit it.

In [None]:
# Predict the target for the competition test set with model2, then display the predictions.
test_predicted_ols = model2.predict(test)
test_predicted_ols

In [None]:
# Assemble the submission frame: the original `id` column next to the model2 predictions.
# Both columns are named explicitly, so the result no longer depends on the unnamed
# prediction series being auto-labelled `0` by `pd.concat` and renamed afterwards.
# The intermediate `test_pred` frame was never used and has been removed.
submission = pd.DataFrame({
    'id': test_original['id'],
    'target': test_predicted_ols,
})

In [None]:
# Preview the first rows of the submission frame.
submission.head()


In [None]:
# Let's compare the distribution of the predicted values against the original target values

In [None]:
# Compare the distribution of the true training targets with the predictions made for the test set.
# Both curves share one axes, so a legend is drawn to make the `label=` values visible;
# without it the two curves cannot be told apart.
sns.distplot(train['target'], color='blue', label='train')
sns.distplot(submission['target'], color='red', label='test')
plt.legend()
plt.show()

Well, even though the model RMSE was 0.73, we can see above that the predictions might not be very good. Let's submit and check what the score is.

In [None]:
# Write the submission to result.csv, with a header row and without the index column.
submission.to_csv("result.csv", index = False, header = True)
