In [None]:
#---------------------------------------------------------------------------
# Import Libraries
#---------------------------------------------------------------------------

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import string
import os
import time

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [None]:
#---------------------------------------------------------------------------
# Print Location of files
#---------------------------------------------------------------------------
# Walk the Kaggle input directory recursively and print the full path of
# every file found, one per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for entry in names:
        print(os.path.join(folder, entry))

In [None]:
#---------------------------------------------------------------------------
# Read train, test files
#---------------------------------------------------------------------------
# Single place to edit if the competition data is mounted somewhere else.
DATA_DIR = '/kaggle/input/tabular-playground-series-feb-2021'

train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
# Sample submission frame; its 'target' column is overwritten with the
# model predictions before it is written out at the end of the notebook.
submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [None]:
#---------------------------------------------------------------------------
# View csv files
#---------------------------------------------------------------------------
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Preview the first rows of the sample submission frame.
submission.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info()

In [None]:
#---------------------------------------------------------------------------
# Make target variable and remove useless columns
#---------------------------------------------------------------------------
# The cell is written to be safely re-runnable: the target is only captured
# while the column still exists, and the drops ignore columns that are
# already gone (the previous in-place version raised on a second run).
if 'target' in train.columns:
    target = train['target']
train = train.drop(columns=['id', 'target'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

In [None]:
#---------------------------------------------------------------------------
# Show all categorical values
#---------------------------------------------------------------------------
# For each of the columns cat0 .. cat9, collect the distinct values and how
# often each occurs, keep them in two parallel lists and print them.
c_unq = []
c_counts = []
for i in range(10):
    cat = f'cat{i}'
    unq, counts = np.unique(train[cat].to_numpy(), return_counts=True)
    c_unq.append(unq)
    c_counts.append(counts)
    print(cat, "---->", unq, counts)

In [None]:
#---------------------------------------------------------------------------
# Convert all categorical values into Label Encoding
#---------------------------------------------------------------------------
# One encoder per column, shared by train and test. Fitting separate encoders
# on each frame (as before) assigns codes from each frame's own sorted set of
# categories, so the same category could receive different integers in train
# and test whenever the two sets differ. Fitting on the union of both frames
# keeps the mapping identical and also covers categories seen only in test.
for col in train.columns:
    if train[col].dtype == 'object':
        le = LabelEncoder()
        if col in test.columns:
            le.fit(pd.concat([train[col], test[col]], ignore_index=True))
            test[col] = le.transform(test[col])
        else:
            le.fit(train[col])
        train[col] = le.transform(train[col])

# Any object column that exists only in test has no training counterpart to
# agree with; encode it on its own so the frame is fully numeric.
for col in test.columns:
    if test[col].dtype == 'object':
        le = LabelEncoder()
        test[col] = le.fit_transform(test[col])

In [None]:
# Check the training frame after label encoding.
train.head()

In [None]:
# Check the test frame after label encoding.
test.head()

In [None]:
# Map the first 15 uppercase ASCII letters to consecutive integers
# ('A' -> 0, 'B' -> 1, ..., 'O' -> 14) and show the mapping.
idx = {letter: position
       for position, letter in enumerate(string.ascii_uppercase[:15])}
print(idx)

In [None]:
# Hold out 25% of the training rows for validation; the fixed random_state
# makes the split reproducible. (A stray doctest "..." continuation prompt
# inside the call was removed: it is a syntax error outside IPython.)
X_train, X_test, y_train, y_test = train_test_split(
    train, target, test_size=0.25, random_state=2)

In [None]:
# Display the training-feature part of the split.
X_train

In [None]:
# Display the training-target part of the split.
y_train

In [None]:
#---------------------------------------------------------------------------
# Linear Regression Model
#---------------------------------------------------------------------------
start = time.time()
lr = LinearRegression()
lr.fit(X_train,y_train)
y_lr = lr.predict(X_test)
# RMSE on the hold-out split. Computed as sqrt(MSE) instead of passing
# squared=False, because that keyword is deprecated in scikit-learn 1.4 and
# removed in 1.6; the two are equivalent for a single target.
error_lr = np.sqrt(mean_squared_error(y_test,y_lr))
print(error_lr)
end = time.time()

print("Time Taken {} seconds".format(end - start))

In [None]:
#---------------------------------------------------------------------------
# Random Forest Model
#---------------------------------------------------------------------------
start = time.time()
# random_state pins the bootstrap/feature sampling so the reported score is
# reproducible across runs (same seed as the train/validation split).
rf = RandomForestRegressor(n_estimators=100,max_depth=9,n_jobs=-1,random_state=2)
rf.fit(X_train,y_train)
y_rf = rf.predict(X_test)
# RMSE on the hold-out split. Computed as sqrt(MSE) instead of passing
# squared=False, because that keyword is deprecated in scikit-learn 1.4 and
# removed in 1.6; the two are equivalent for a single target.
error_rf = np.sqrt(mean_squared_error(y_test,y_rf))
print(error_rf)
end = time.time()

print("Time Taken {} seconds".format(end - start))

In [None]:
#---------------------------------------------------------------------------
# Ridge (L2-regularised) Model
#---------------------------------------------------------------------------
# Note: Ridge applies an L2 penalty; an L1 penalty would be Lasso.
start = time.time()
rid = Ridge()
rid.fit(X_train,y_train)
y_rid = rid.predict(X_test)
# RMSE on the hold-out split. Computed as sqrt(MSE) instead of passing
# squared=False, because that keyword is deprecated in scikit-learn 1.4 and
# removed in 1.6; the two are equivalent for a single target.
error_rid = np.sqrt(mean_squared_error(y_test,y_rid))
print(error_rid)
end = time.time()

print("Time Taken {} seconds".format(end - start))

In [None]:
#---------------------------------------------------------------------------
# XG Boost Model
#---------------------------------------------------------------------------
start = time.time()
xg = XGBRegressor(n_estimators = 150,learning_rate = 0.045)

xg.fit(X_train,y_train)

y_xg = xg.predict(X_test)
# Report RMSE like the other model cells. The previous version printed the
# plain MSE here, which is not comparable with the other scores.
error_xg = np.sqrt(mean_squared_error(y_test,y_xg))
print(error_xg)
end = time.time()

print("Time Taken {} seconds".format(end - start))

In [None]:
def plot_cal_result(model,y,y_pred,total_plots = 10000,figsize = (6,6)):
    """Scatter-plot actual vs. predicted values and return the RMSE.

    Parameters
    ----------
    model : str
        Label placed in the plot title. The caller in this notebook passes
        the model's display name.
    y : array-like
        True target values.
    y_pred : array-like
        Predicted values, aligned with ``y``.
    total_plots : int, default 10000
        Maximum number of leading points drawn in the scatter plot.
    figsize : tuple, default (6, 6)
        Figure size forwarded to ``plt.figure``.

    Returns
    -------
    float
        Root mean squared error computed over all of ``y`` / ``y_pred``
        (not only the plotted subset).
    """
    # sqrt(MSE) rather than squared=False: that keyword is deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    error = np.sqrt(mean_squared_error(y, y_pred))

    plt.figure(figsize = figsize)
    # np.asarray makes the [:total_plots] slice purely positional, whatever
    # index a pandas Series passed as `y` may carry.
    plt.scatter(np.asarray(y)[:total_plots], np.asarray(y_pred)[:total_plots])
    # Use the `model` argument for the title; the previous code read a global
    # called `name`, which only existed while the comparison loop was running.
    plt.title("{} : {}".format(model,error),fontsize = 15)
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.show()

    return error

In [None]:
# Fit every candidate model on the training split, plot and score it on the
# hold-out split, then report the model(s) with the lowest RMSE.
start = time.time()
model_name = ['Linear Regression', 'Ridge', 'Random Forest', 'XG Boost']
models = [
    LinearRegression(),
    Ridge(),
    # random_state keeps the forest (and therefore the ranking) reproducible.
    RandomForestRegressor(n_estimators=100,max_depth=9,n_jobs=-1,random_state=2),
    XGBRegressor(n_estimators = 150,learning_rate = 0.045)
]
# y_out[i] is the hold-out RMSE of models[i] (despite the name, these are
# scores, not predictions).
y_out = list()
for name,model in zip(model_name,models):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    score = plot_cal_result(name,y_test,y_pred)
    y_out.append(score)

# Take the minimum directly instead of starting from a magic 100.0 sentinel,
# which silently reported no best model whenever every RMSE was >= 100.
error = min(y_out)

for score,name,model in zip(y_out,model_name,models):
    if score == error:
        print("{} is the best model with score {}".format(name,score))

end = time.time()

print("Time Taken {} seconds".format(end - start))

In [None]:
# Predict the competition test set with `xg`, the XGBoost model fitted on
# X_train in the standalone XG Boost cell (not the instance from the
# comparison loop). This overwrites the `y_pred` left over from that loop.
y_pred = xg.predict(test)

In [None]:
# Submission frame before the predictions are written into it.
submission.head()

In [None]:
# Overwrite the 'target' column with the test-set predictions.
# NOTE(review): assumes `test` rows are in the same order as `submission` — confirm.
submission['target'] = y_pred

In [None]:
# Submission frame after the 'target' column has been assigned.
submission.head()

In [None]:
# index=False: without it pandas writes the DataFrame index as an extra
# unnamed first column, so the file no longer matches the columns of the
# sample submission it was read from.
submission.to_csv('submission.csv', index=False)