In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available datasets are visible at the top of the notebook.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

In [None]:
# Load the competition tables: the labelled training set, the unlabelled test
# set, and the submission template whose 'target' column gets overwritten later.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jan-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jan-2021/test.csv',
        '/kaggle/input/tabular-playground-series-jan-2021/sample_submission.csv',
    )
)

# First look at the data

The dataset consists of numbers only. This is the first time I will be working with data that comes with no context at all.

The dataset has no missing values. 

In [None]:
# Preview the first five rows of the training frame.
train.head(5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train.info()

# Submission file as provided

In [None]:
# Baseline entry: write the sample submission back out without changing it.
filename = 'submission_as_provided.csv'
sample_submission.to_csv(path_or_buf=filename, index=False)
print(f'Saved file: {filename}')

Submission file as provided gave a score of 7.44666

# Lazy model

Building a model on the data, as it is..

In [None]:
# Show the column labels of the training frame.
train.columns

In [None]:
from sklearn import metrics
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Baseline comparison: fit several off-the-shelf regressors on the raw,
# untransformed features and score each one on a held-out 33% split.
feature_columns = [f'cont{i}' for i in range(1, 15)]  # 'cont1' .. 'cont14'

y = train['target']
X = train[feature_columns]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, shuffle=True
)

# The decision tree is seeded so that its hold-out metrics are repeatable
# between runs; the other estimators are left at their defaults.
models = [
    DecisionTreeRegressor(random_state=42),
    LinearRegression(),
    Ridge(),
    Lasso(),
    XGBRegressor(),
]

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score MSE once and derive RMSE from it instead of re-scoring.
    mse = metrics.mean_squared_error(y_test, y_pred)

    print('Model:', model)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print('r2_score:', metrics.r2_score(y_test, y_pred))
    print('-------------------------------------')


In [None]:
# Refit a plain linear regression on the full training data (X, y) and use it
# to fill the 'target' column of the submission template.
X_for_submission = test.loc[:, [
    'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
    'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14',
]]

model = LinearRegression()
model.fit(X, y)
pred = model.predict(X_for_submission)

# Overwrites the template's target column in place before saving.
sample_submission['target'] = pred

filename = 'submission_lazy_model.csv'
sample_submission.to_csv(path_or_buf=filename, index=False)
print(f'Saved file: {filename}')

Checking with the linear regression model.

Above file got a score of 0.72782

In [None]:
# Same submission flow as the linear model, but with a default XGBRegressor
# trained on the full training data (X, y).
X_for_submission = test.loc[:, [
    'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
    'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14',
]]

model = XGBRegressor()
model.fit(X, y)
pred = model.predict(X_for_submission)

# Overwrites the template's target column in place before saving.
sample_submission['target'] = pred

filename = 'submission_lazy_model_XGBRegressor.csv'
sample_submission.to_csv(path_or_buf=filename, index=False)
print(f'Saved file: {filename}')

Above file got a score of 0.70495

# Correlation matrix

- cont9 and cont1 are highly positively correlated.
- cont6 and cont9 to cont13 are highly positively correlated.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of `train`
# (values shown with two decimals).
fig, ax = plt.subplots(figsize=(15, 9))
correlations = train.corr()
sns.heatmap(correlations, annot=True, fmt='.2f', ax=ax)
plt.show()

# Removing the cont6 feature

In [None]:
from sklearn import metrics
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Repeat the baseline comparison with the 'cont6' feature dropped, scoring each
# regressor on a held-out 33% split.
feature_columns_without_cont6 = [f'cont{i}' for i in range(1, 15) if i != 6]

y = train['target']
X = train[feature_columns_without_cont6]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, shuffle=True
)

# The decision tree is seeded so that its hold-out metrics are repeatable
# between runs; the other estimators are left at their defaults.
models = [
    DecisionTreeRegressor(random_state=42),
    LinearRegression(),
    Ridge(),
    Lasso(),
    XGBRegressor(),
]

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score MSE once and derive RMSE from it instead of re-scoring.
    mse = metrics.mean_squared_error(y_test, y_pred)

    print('Model:', model)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print('r2_score:', metrics.r2_score(y_test, y_pred))
    print('-------------------------------------')


In [None]:
# Train a default XGBRegressor on the full training data (X, y) and predict on
# the test features with 'cont6' left out.
X_for_submission = test.loc[:, [
    'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont7',
    'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14',
]]

model = XGBRegressor()
model.fit(X, y)
pred = model.predict(X_for_submission)

# Overwrites the template's target column in place before saving.
sample_submission['target'] = pred

filename = 'submission_cont6_removed_XGBRegressor.csv'
sample_submission.to_csv(path_or_buf=filename, index=False)
print(f'Saved file: {filename}')

# Exploring the target value

The target value has a bimodal distribution.

*In statistics, a bimodal distribution is a probability distribution with two different modes. These appear as distinct peaks (local maxima) in the probability density function, as shown below. Categorical, continuous, and discrete data can all form bimodal distributions.*

*More generally, a multimodal distribution is a probability distribution with two or more modes*

In [None]:
# Histogram + density estimate of the regression target.
# NOTE(review): `sns.distplot` is deprecated in newer seaborn releases; kept
# here because the installed seaborn version is not known.
fig, ax = plt.subplots(figsize=(15, 9))
target_values = train['target']
sns.distplot(target_values, ax=ax)
ax.set_title('Distribution of target value', fontsize=20, c='black')
plt.show()

# Clustering experimentation

In [None]:
from sklearn.cluster import KMeans

# Elbow plot: fit KMeans for k = 1..9 on all 14 features and record the
# within-cluster sum of squared distances (inertia) for each k.
X = train.loc[:, [
    'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
    'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14',
]]

cluster_counts = range(1, 10)

sse = []
for k in cluster_counts:
    # max_iter=10 caps the number of iterations per initialisation.
    kmeans = KMeans(n_clusters=k, random_state=42, init='random', n_init=10, max_iter=10)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)

f, ax = plt.subplots(figsize=(15, 9))
ax.plot(cluster_counts, sse)
ax.set_xticks(cluster_counts)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("SSE")
ax.set_title('SSE for different number of clusters', fontsize=20, c='black')
plt.show()

In [None]:
# Fit a 2-cluster KMeans on X and attach each row's cluster label to `train`.
# NOTE: this adds a 'Cluster' column to `train` in place, so any cell that
# uses `train` after this one sees the extra column.
kmeans = KMeans(n_clusters=2, random_state=0, init='random', n_init=10, max_iter=10)
cluster_labels = kmeans.fit(X).predict(X)
train['Cluster'] = cluster_labels
train

Hmm.. I wanted to get clusters which would give some kind of a unimodal distribution.. but I got this instead..

In [None]:
# Overlay the target distribution of each of the two KMeans clusters on one axes.
fig, ax = plt.subplots(figsize=(15, 9))
for cluster_id in (0, 1):
    in_cluster = train['Cluster'] == cluster_id
    sns.distplot(train.loc[in_cluster, 'target'], ax=ax)
ax.set_title('Distribution of target value', fontsize=20, c='black')
plt.show()

# Exploring the continuous variables 1 - 14

In [None]:
# Draw one distribution plot per continuous feature, each in its own figure.
continuous_features = [
    'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
    'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14',
]

for continuous_feature in continuous_features:
    fig, ax = plt.subplots(figsize=(15, 9))
    sns.distplot(train[continuous_feature], ax=ax)
    ax.set_title(f'Distribution of {continuous_feature}', fontsize=20, c='black')
    plt.show()

# Sweetviz

In [None]:
!pip install sweetviz

In [None]:
import sweetviz as sv

In [None]:
# Build a Sweetviz report object from the training frame.
# NOTE(review): if the clustering cell has been run, `train` also carries the
# added 'Cluster' column at this point — confirm that is intended.
my_report = sv.analyze(train)

In [None]:
# Render the Sweetviz report inline in the notebook output.
my_report.show_notebook()