In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
#visualization
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns 

# Silence every Python warning so that cell outputs stay uncluttered.
# This hides warnings only (deprecation notices included); exceptions are still raised.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the three competition CSV files into DataFrames:
# the sample submission, the training set and the test set (in that order).
example, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-feb-2021/sample_submission.csv',
        '../input/tabular-playground-series-feb-2021/train.csv',
        '../input/tabular-playground-series-feb-2021/test.csv',
    )
)

In [None]:
# Preview the frame loaded from sample_submission.csv
# (presumably the layout a submission file is expected to follow).
example

Datasets (test and train):

In [None]:
# Preview the test set. Per the original author's note it has about 66% as many rows
# as the training set and one column fewer (no `target`).
test

In [None]:
# True if at least one cell of the test set is missing, False otherwise.
test.isna().to_numpy().any()

In [None]:
# Preview the training set (the frame loaded from train.csv).
train

In [None]:
# True if at least one cell of the training set is missing, False otherwise.
train.isna().to_numpy().any()

## Explore the data:

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
train.describe(include='all')

In [None]:
# Pairwise correlation of the numeric columns, rendered as a colour-shaded table.
# Non-numeric columns are excluded explicitly: older pandas dropped them silently,
# while recent pandas raises when asked to correlate them.
train.select_dtypes(include='number').corr().style.background_gradient(cmap='ocean')

In [None]:
# Box plots of the columns to spot possible outliers.
# The `id` column is removed for this visualisation only; `train` itself is untouched.
noID = train.drop('id', axis=1)
boxplot1 = noID.boxplot(rot=45)

In [None]:
## Explore the TARGET column
# Box plot restricted to `target`, to inspect its spread and extreme values.
boxplot2=train.boxplot(column=['target'])

In [None]:
# Target value plotted against row position.
train['target'].plot.line(color='tan')

In [None]:
# Summary statistics of the target column.
train['target'].describe()

In [None]:
# Number of distinct target values.
train['target'].nunique()

In [None]:
# How often each distinct target value occurs, most frequent first.
train['target'].value_counts()

In [None]:
# Kernel density estimate of every numeric column, one panel per column.
# Only numeric columns can be drawn, so the grid is sized from their actual count.
# A fixed 26-row layout reserves a slot for every column of `train` and squeezes
# the real panels into part of the figure.
numeric_cols = train.select_dtypes(include='number')
n_panels = numeric_cols.shape[1]
numeric_cols.plot(kind='density', subplots=True, layout=(n_panels, 1), sharex=False, figsize=(12, 14))
plt.show()

In [None]:
# Split the training frame by column type; both parts keep `id`.
# cats: the ten cat* columns.
cats = train.loc[:, [
    'id',
    'cat0', 'cat1', 'cat2', 'cat3', 'cat4',
    'cat5', 'cat6', 'cat7', 'cat8', 'cat9',
]]
# nums: the fourteen cont* columns plus the target.
nums = train.loc[:, [
    'id',
    'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6',
    'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13',
    'target',
]]

In [None]:
cats.cat0.value_counts().plot(kind='barh', color='darkred', title='Column: Cat0',figsize=(6,3),edgecolor=(0,0,0))

In [None]:
sns.countplot(cats['cat1'], palette = "Set3", edgecolor=(0,0,0))
plt.xticks()

In [None]:
cats.cat2.value_counts().plot(kind='barh', title='Column: Cat2', color='darkorange', figsize=(6,3),edgecolor=(0,0,0))

In [None]:
sns.countplot(cats['cat3'], palette = "Set2", edgecolor=(0,0,0))
plt.xticks()

In [None]:
cats.cat4.value_counts().plot(kind='barh', color='purple', title='Column: Cat4',figsize=(8,3),edgecolor=(0,0,0))

In [None]:
sns.countplot(cats['cat5'], palette = "icefire", edgecolor=(0,0,0))
plt.xticks()

In [None]:
cats.cat6.value_counts().plot(kind='barh', color='forestgreen', title='Column: Cat6',figsize=(8,4),edgecolor=(0,0,0))

In [None]:
cats.cat7.value_counts().plot(kind='barh', color='navy', title='Column: Cat7',figsize=(9,2),edgecolor=(0,0,0))

In [None]:
sns.countplot(cats['cat8'], palette = "Set1", edgecolor=(0,0,0))
plt.xticks()

In [None]:
cats.cat9.value_counts().plot(kind='barh', color='salmon', title='Column: Cat9',figsize=(6,5),edgecolor=(0,0,0))

In [None]:
# Describe the categorical columns only; `id` is removed first so it does not appear.
noID2 = cats.drop('id', axis=1)
noID2.describe(include='all')

In [None]:
from sklearn import preprocessing 
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column independently: each column gets its own
# freshly fitted LabelEncoder, and the codes are reassembled on the original index.
encoded1 = pd.DataFrame(
    {col: LabelEncoder().fit_transform(noID2[col]) for col in noID2.columns},
    index=noID2.index,
)
encoded1

In [None]:
# Pairwise correlation between the label-encoded categorical columns, colour-shaded.
encoded1.corr().style.background_gradient(cmap='YlOrBr')

In [None]:
# Put the numeric columns and the encoded categorical columns side by side,
# matching rows on the index and keeping only index values present in both frames.
merged1 = nums.join(encoded1, how="inner")
merged1

In [None]:
# Correlation across all merged columns (id, cont*, target and the encoded cat*).
merged1.corr().style.background_gradient(cmap='cubehelix')

In [None]:
#Pre-process the dataset for ML
from sklearn import preprocessing

# Min-max scaling rescales each column linearly onto [0, 1].
# Every column of merged1 is scaled, `id` and `target` included, so models
# trained on df1 predict the target in scaled units rather than original units.
scaled = preprocessing.MinMaxScaler()

minmaxed=scaled.fit_transform(merged1)

# Wrap the scaled array in a DataFrame. Labels and index come from merged1 itself,
# so they cannot drift out of sync with the column order the scaler actually saw.
df1 = pd.DataFrame(minmaxed, columns=merged1.columns, index=merged1.index)

In [None]:
# Preview the min-max-scaled frame.
df1

In [None]:
# Correlation table of the scaled columns, colour-shaded.
df1.corr().style.background_gradient(cmap='rocket')

## Regression
Goal: find the best-fitting line through the data.

Types:

1. Linear Regression (single and multiple variables)
2. Bayesian Linear Regression
3. SVR (Support Vector Regression)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression

# Single-variable linear regression: explain the (scaled) target from cat0 alone.
# Double-bracket selection keeps both arrays two-dimensional, shape (n_rows, 1).
X = df1[['cat0']].to_numpy()
y = df1[['target']].to_numpy()


In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3
)

# Fit ordinary least squares on the training portion.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [None]:
# Fitted parameters: the intercept on the first line, the slope on the second.
print(regressor.intercept_, regressor.coef_, sep='\n')

In [None]:
# Predict the held-out rows with the single-feature model.
pred1 = regressor.predict(X_test)

# Actual and predicted values side by side, one row per held-out sample.
LinReg = pd.DataFrame(
    np.column_stack([y_test, pred1]),
    columns=['Actual', 'Predicted'],
)
LinReg

In [None]:
# Regression line: held-out points (green) with the model's predictions drawn
# over them as a blue line.

plt.scatter(X_test, y_test,  color='seagreen')
plt.plot(X_test, pred1, color='blue', linewidth=3)
plt.show()

In [None]:
# Grouped bars comparing actual and predicted values for the first ten held-out rows.
first10preds = LinReg.iloc[:10]
c = ('darkorange', 'darkcyan')
first10preds.plot.bar(figsize=(9,6), color=c)
# Solid major grid lines, dotted minor grid lines.
plt.grid(which='major', linestyle='-', linewidth='0.3', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [None]:
# Evaluate the single-feature model on the held-out rows.
# Both numbers are in scaled-target units because y comes from the min-max-scaled frame.
from sklearn.metrics import mean_squared_error 
from numpy import sqrt

mse = mean_squared_error(y_test, pred1)
rmse = sqrt(mse)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % rmse)

In [None]:
## Multiple-variable regression
# Two predictors (cat1 and cat3) against the scaled target.
X2 = df1[['cat1', 'cat3']].to_numpy()
y2 = df1['target'].to_numpy()

In [None]:
# Hold out 30% of the rows for evaluation (seed 2 for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X2, y2, random_state=2, test_size=0.3
)

# Fit ordinary least squares on the two selected features.
Tworegs = LinearRegression()
Tworegs.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the two-feature model.
pred2 = Tworegs.predict(X_test)

# Actual and predicted values side by side, one row per held-out sample.
MultReg = pd.DataFrame(
    np.column_stack([y_test, pred2]),
    columns=['Actual', 'Predicted'],
)
MultReg

In [None]:
# Actual and predicted values of the first ten held-out rows, drawn as two lines.
first10preds = MultReg.iloc[:10]
c = ('dimgray', 'darkred')
first10preds.plot.line(figsize=(7,4), color=c)
plt.show()

In [None]:
# Error of the two-feature model on its held-out rows (scaled-target units).
mse = mean_squared_error(y_test, pred2)
rmse = sqrt(mse)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % rmse)

In [None]:
# Feature matrix for the full model: every scaled column except `target`.
sansTarget = df1.loc[:, [
    'id',
    'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6',
    'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13',
    'cat0', 'cat1', 'cat2', 'cat3', 'cat4',
    'cat5', 'cat6', 'cat7', 'cat8', 'cat9',
]]

In [None]:
# Bayesian ridge regression on all features.
from sklearn.linear_model import BayesianRidge 

# Inputs: every scaled column except the target; label: the scaled target.
X3 = sansTarget.values
y3 = df1.target.values

# 66.7% of the rows are held out, so the model is fitted on the remaining third.
X_train, X_test, y_train, y_test = train_test_split(
    X3, y3, random_state=3, test_size=0.6670
)

# Fit the model, then predict the held-out rows.
model3 = BayesianRidge()
model3.fit(X_train, y_train)

pred3 = model3.predict(X_test)

In [None]:
# Predict the held-out rows with the Bayesian ridge model.
pred3 = model3.predict(X_test)

# Actual and predicted values side by side, one row per held-out sample.
Bay = pd.DataFrame(
    np.column_stack([y_test, pred3]),
    columns=['Actual', 'Predicted'],
)
Bay

In [None]:
# Held-out targets (dots) and the model's predictions (line), by row position.
x_ax = range(len(y_test))
plt.scatter(x_ax, y_test, s=3, color="sienna", label="Actual")
plt.plot(x_ax, pred3, lw=0.5, color="lightseagreen", label="Predicted")
# The labels above are only shown once a legend is drawn.
plt.legend()
plt.show() 

In [None]:
# Horizontal bars comparing actual and predicted values for the first ten held-out rows.
first10preds = Bay.iloc[:10]
c = ('purple', 'salmon')
first10preds.plot.barh(figsize=(9,6), color=c)
# Solid major grid lines, dotted minor grid lines.
plt.grid(which='major', linestyle='-', linewidth='0.3', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [None]:
# An RMSE of 0 (almost never reached in practice) would mean a perfect fit;
# in general, the lower the RMSE the better.
# Both numbers are in scaled-target units because y comes from the min-max-scaled frame.
mse = mean_squared_error(y_test, pred3)
rmse = sqrt(mse)

print("MSE: %.2f" % mse)
print("RMSE: %.2f" % rmse)

In [None]:
#save

# Build the submission from the competition test set.
# The hold-out predictions in `pred3` belong to rows of train.csv, their positional
# index is not a test id, and they are in min-max-scaled target units, so they
# cannot be submitted as they are.
from sklearn.linear_model import BayesianRidge

scaler_cols = list(merged1.columns)  # column order the scaler was fitted on
feature_cols = [col for col in scaler_cols if col != 'target']
target_pos = scaler_cols.index('target')

# Re-create the training-time preprocessing for the test rows.
prepared = {}
for col in scaler_cols:
    if col == 'target':
        # Placeholder so the frame has the width the scaler expects; never used as a feature.
        prepared[col] = 0.0
    elif col in noID2.columns:
        # Same codes as the label encoding used for training: position of the value
        # among the sorted training categories. A category unseen in training gets -1.
        prepared[col] = pd.Categorical(test[col], categories=sorted(train[col].unique())).codes
    else:
        prepared[col] = test[col]
test_prepared = pd.DataFrame(prepared, index=test.index)

# Apply the scaler fitted on the training data (transform only, no refit).
test_scaled = pd.DataFrame(scaled.transform(test_prepared), columns=scaler_cols, index=test.index)

# Refit on all training rows: the earlier split kept only a third of them for fitting.
final_model = BayesianRidge()
final_model.fit(df1[feature_cols].values, df1['target'].values)
scaled_pred = final_model.predict(test_scaled[feature_cols].values)

# Undo the min-max scaling of the target so predictions are in original units.
target_pred = scaled_pred * scaled.data_range_[target_pos] + scaled.data_min_[target_pos]

prediction = pd.DataFrame({'target': target_pred})

# Keep the real ids of the test rows.
prediction['id'] = test['id'].values

In [None]:
# The submission needs 200000 rows: keep the `id` and `target` columns, in that
# order, and at most the first 200000 rows.
result = prediction.loc[:, ['id', 'target']].iloc[:200000]
result

In [None]:
# Write the submission frame to result.csv in the working directory;
# index=False keeps the DataFrame index out of the file.
result.to_csv('result.csv',index=False)