In [None]:
# Core data / plotting stack used throughout the notebook.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import metrics
# Apply seaborn's default theme, then render figures inline in the notebook.
sns.set()
%matplotlib inline
# Switch matplotlib to its dark-background style for the plots created below.
plt.style.use('dark_background')
import warnings
# NOTE(review): this silences every warning, including deprecation notices
# raised by the plotting and modelling libraries used further down.
warnings.filterwarnings("ignore")

In [None]:
# Read the competition's training CSV into a DataFrame.
train_data = pd.read_csv("../input/tabular-playground-series-aug-2021/train.csv")

In [None]:
# Show the dtype of every column in the training frame.
train_data.dtypes

In [None]:
# Preview the first rows of the training frame.
train_data.head()

# Missing Values 

In [None]:
# Report the frame's dimensions and the total count of missing cells.
n_rows, n_cols = train_data.shape
n_missing = train_data.isna().sum().sum()
print(f'Number of rows: {n_rows};  Number of columns: {n_cols};  No of missing values: {n_missing}')

# Describing the basic Statistics 

In [None]:
# Per-column summary statistics (one row per column), styled with a bar on
# 'max' and a blue gradient on 'min', '50%' and '75%'.
train_summary = train_data.describe(include="all").T
(
    train_summary.style
    .bar(subset=['max'], color='#d35400')
    .background_gradient(subset=['min', '50%', '75%'], cmap='Blues')
)

In [None]:
# Summary statistics and number of distinct values of the target column.
loss_column = train_data["loss"]
print('Loss column basic statistics:')
print(loss_column.describe())
print("unique values:", loss_column.nunique())

In [None]:
# How many rows carry each distinct loss value (most frequent first).
print('Frequency of loss column values: ')
train_data["loss"].value_counts()

# Data types

In [None]:
# Optional: uncomment the next line to lift pandas' row display limit so the
# dtype listing below is not truncated.
# pd.options.display.max_rows =  None
train_data.dtypes

In [None]:
# Keep only the int64-typed columns of the training frame and display them.
int_train = train_data.select_dtypes(include="int64")
int_train

In [None]:
# Lift the column display limit, then keep only the float64-typed columns of
# the training frame and display them.
pd.set_option("display.max_columns", None)
float_train = train_data.select_dtypes(include="float64")
float_train

# Distribution integer types

In [None]:
# Density histogram (with KDE overlay) of the row-wise mean of the
# integer-typed training columns.
# sns.distplot is deprecated; histplot with stat="density" and kde=True is
# the documented replacement for a normalised histogram plus KDE curve.
# NOTE(review): the row-wise mean mixes every int64 column together
# (presumably including `id`) -- confirm that this is the intended quantity.
fig, ax = plt.subplots(figsize=(16, 6))
sns.histplot(int_train.mean(axis=1), color="green", kde=True,
             stat="density", bins=150, ax=ax)
ax.set_title("Distribution integer types values in train", fontsize=20)
ax.set_ylabel("Density", fontsize=15)
ax.tick_params(axis="both", labelsize=15)
plt.show()

# Distribution float types

In [None]:
# Density histogram (with KDE overlay) of the row-wise mean of the
# float-typed training columns.
# sns.distplot is deprecated; histplot with stat="density" and kde=True is
# the documented replacement for a normalised histogram plus KDE curve.
fig, ax = plt.subplots(figsize=(16, 6))
sns.histplot(float_train.mean(axis=1), color="red", kde=True,
             stat="density", bins=150, ax=ax)
ax.set_title("Distribution float types values in train", fontsize=20)
ax.set_ylabel("Density", fontsize=15)
ax.tick_params(axis="both", labelsize=15)
plt.show()

# Cleaning the train data

In [None]:
# Replace every value by its absolute value and cast the whole frame to int64.
# NOTE(review): the int64 cast drops the fractional part of every float
# feature and abs() drops the sign -- confirm this information loss is
# intended. The test frame receives the same treatment further down, so any
# change here must be mirrored there.
train_data = train_data.abs().astype(np.int64)

In [None]:
# Display the training frame after the abs/int64 conversion.
train_data

In [None]:
# Bar chart of how many training rows carry each distinct loss value.
# The bars are built from the value counts alone: x is the loss value and y
# its frequency. (Previously the counts were paired with the unrelated `id`
# column, whose length and meaning do not match.)
loss_counts = train_data["loss"].value_counts().sort_index()

sns.set()
fig, ax = plt.subplots(figsize=(22, 12.5))
sns.barplot(x=loss_counts.index, y=loss_counts.values, ax=ax, palette="deep")
ax.set_xlabel("Loss", fontsize=20, fontweight='bold')
ax.set_ylabel("Count", fontsize=20, fontweight='bold')

# Write each bar's count just above it; counts are whole numbers.
for p in ax.patches:
    ax.annotate(format(p.get_height(), '.0f'),
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='bottom',
                xytext=(10, 9),
                textcoords='offset points', rotation=0, fontsize=15)
plt.xticks(rotation=90, fontsize=20)
plt.yticks(fontsize=20)

plt.title("Loss ", y=1.01, fontsize=25, fontweight='bold')
plt.show()


# Test data

In [None]:
# Read the competition's test CSV into a DataFrame.
test_data = pd.read_csv("../input/tabular-playground-series-aug-2021/test.csv")

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Report the test frame's dimensions and the total count of missing cells.
n_rows, n_cols = test_data.shape
n_missing = test_data.isna().sum().sum()
print(f'Number of rows: {n_rows}; Number of columns: {n_cols}; No of missing values: {n_missing}')

# Describing the basic Statistics 

In [None]:
# Per-column summary statistics (one row per column), styled with a bar on
# 'max' and a blue gradient on 'min', '50%' and '75%'.
test_summary = test_data.describe(include="all").T
(
    test_summary.style
    .bar(subset=['max'], color='#d35400')
    .background_gradient(subset=['min', '50%', '75%'], cmap='Blues')
)

# Data types

In [None]:
# Show the dtype of every column in the test frame.
test_data.dtypes

In [None]:
# Keep only the int64-typed columns of the test frame and display them.
int_test = test_data.select_dtypes(include="int64")
int_test

In [None]:
# Lift the column display limit, then keep only the float64-typed columns of
# the test frame and display them.
pd.set_option("display.max_columns", None)
float_test = test_data.select_dtypes(include="float64")
float_test

# Distribution integer types

In [None]:
# Density histogram (with KDE overlay) of the row-wise mean of the
# integer-typed test columns.
# The title now names the test set (it previously said "train"), and the
# deprecated sns.distplot is replaced by histplot(stat="density", kde=True).
fig, ax = plt.subplots(figsize=(16, 6))
sns.histplot(int_test.mean(axis=1), color="blue", kde=True,
             stat="density", bins=150, ax=ax)
ax.set_title("Distribution integer types values in test", fontsize=20)
ax.set_ylabel("Density", fontsize=15)
ax.tick_params(axis="both", labelsize=15)
plt.show()

# Distribution Float types

In [None]:
# Density histogram (with KDE overlay) of the row-wise mean of the
# float-typed test columns.
# The title now names the test set (it previously said "train"), and the
# deprecated sns.distplot is replaced by histplot(stat="density", kde=True).
fig, ax = plt.subplots(figsize=(16, 6))
sns.histplot(float_test.mean(axis=1), color="orange", kde=True,
             stat="density", bins=150, ax=ax)
ax.set_title("Distribution Float types values in test", fontsize=20)
ax.set_ylabel("Density", fontsize=15)
ax.tick_params(axis="both", labelsize=15)
plt.show()

# Cleaning the data

In [None]:
# Replace every value by its absolute value and cast the whole frame to int64,
# the same conversion that is applied to the training frame.
# NOTE(review): the int64 cast drops the fractional part of every float
# feature and abs() drops the sign -- confirm this information loss is
# intended, and keep it in sync with the training-data conversion.
test_data = test_data.abs().astype(np.int64)

In [None]:
# Display the test frame after the abs/int64 conversion.
test_data

In [None]:
# Remove the identifier column so the test frame holds model features only.
# Rebinding (instead of inplace=True) with errors="ignore" keeps the cell
# idempotent: re-running it no longer raises KeyError once `id` is gone.
test_data = test_data.drop(columns="id", errors="ignore")

In [None]:
# Setting Targets and Inputs

In [None]:
# Target vector: the loss column. Feature matrix: everything except the
# identifier and the target itself.
targets = train_data["loss"]
inputs = train_data.drop(columns=["id", "loss"])

In [None]:
# Splitting the data 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    inputs, targets, test_size=0.2, random_state=123
)

# Each label now names the array whose shape is printed (two of the four
# labels were previously attached to the wrong array).
print('X_train:', X_train.shape)
print('X_test:', X_test.shape)
print('y_train:', y_train.shape)
print('y_test:', y_test.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Modeling 

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit an ordinary least-squares model on the scaled training split;
# fit() returns the estimator itself, which is shown as the cell output.
reg = LinearRegression().fit(X_train, y_train)
reg

In [None]:
# Predict the loss for the scaled hold-out split with the linear model.
prediction = reg.predict(X_test)

In [None]:
# Fitted intercept of the linear model.
reg.intercept_

In [None]:
# Fitted coefficients of the linear model, one per input feature.
reg.coef_

In [None]:
# Actual vs. predicted loss for the linear model on the hold-out split.
plt.style.use("ggplot")
plt.style.use("dark_background")
plt.figure(figsize=(10, 10))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments.
sns.regplot(x=y_test, y=prediction, fit_reg=True, scatter=True,
            scatter_kws={"s": 100}, color="blue")
plt.xlabel("Loss", fontsize=17)
plt.ylabel("Predicted loss", fontsize=17)
plt.show()

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a 100-tree random forest (seeded for reproducibility) on the full,
# unscaled feature matrix; fit() returns the estimator, which is shown as the
# cell output.
regressor = RandomForestRegressor(n_estimators=100, random_state=0).fit(inputs, targets)
regressor

In [None]:
# Predict on the same rows the forest was fitted on (in-sample predictions).
preds = regressor.predict(inputs)

In [None]:
# A regressor's score() is the coefficient of determination (R^2), not an
# accuracy. It is evaluated on the rows the forest was fitted on, so it
# overstates how well the model generalises.
print("R^2 score (training data): ", regressor.score(inputs, targets))

In [None]:
# Actual vs. in-sample predicted loss for the random forest.
plt.style.use("ggplot")
plt.style.use("dark_background")
plt.figure(figsize=(10, 10))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments.
sns.regplot(x=targets, y=preds, fit_reg=True, scatter=True,
            scatter_kws={"s": 100}, color="red")
plt.xlabel("Loss", fontsize=17)
plt.ylabel("Predicted loss", fontsize=17)
plt.show()

# CatBoost Model

In [None]:
import catboost as ctb

In [None]:
# Fit a CatBoost regressor with default settings on the full, unscaled
# feature matrix; the fitted estimator is shown as the cell output.
cbr = ctb.CatBoostRegressor().fit(inputs, targets)
cbr

In [None]:
# Predict on the same rows CatBoost was fitted on (in-sample predictions).
preds3 = cbr.predict(inputs)

In [None]:
# In-sample fit quality of the CatBoost model.
print("r2 score: ", metrics.r2_score(targets, preds3))
# The value labelled "mse" is now the mean squared error. It used to be the
# logarithm of the mean squared *log* error, which does not match the label
# and raises ValueError as soon as a prediction is negative.
print("mse: ", metrics.mean_squared_error(targets, preds3))

In [None]:
# Actual vs. in-sample predicted loss for the CatBoost model.
plt.style.use("ggplot")
plt.style.use("dark_background")
plt.figure(figsize=(10, 10))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments.
sns.regplot(x=targets, y=preds3, fit_reg=True, scatter=True,
            scatter_kws={"s": 100}, color="green")
plt.xlabel("Loss", fontsize=17)
plt.ylabel("Predicted loss", fontsize=17)
plt.show()

In [None]:
from lightgbm import LGBMRegressor

In [None]:
# Fit a LightGBM regressor (plain regression objective) on the full, unscaled
# feature matrix; the fitted estimator is shown as the cell output.
ltb = LGBMRegressor(objective='regression').fit(inputs, targets)
ltb

In [None]:
# Predict on the same rows LightGBM was fitted on (in-sample predictions).
preds4 = ltb.predict(inputs)

In [None]:
# Computing the r2 score manually

In [None]:
# Residual sum of squares of the LightGBM in-sample predictions.
#   er : list of per-row squared errors (actual - observed) ** 2
#   g  : their sum, used as the numerator of the manual R^2 below
# Computed with array arithmetic: positional and vectorised, so it does not
# rely on `targets` having a 0..n-1 index and does not print one line per
# training row.
actual = targets.to_numpy()
squared_errors = (actual - preds4) ** 2
er = squared_errors.tolist()
g = squared_errors.sum()

# Preview of actual vs. observed values in place of the per-row printout.
pd.DataFrame({"actual": actual, "observed": preds4}).head(10)

In [None]:
# Total sum of squares of the target around its mean (denominator of R^2),
# computed with vectorised arithmetic instead of a Python-level loop.
#   m : mean of the target
#   y : sum of squared deviations from m
m = np.mean(targets)
y = ((targets - m) ** 2).sum()

In [None]:
# Manual R^2 = 1 - RSS / TSS for the LightGBM in-sample predictions.
print("r2 score: ", 1 - (g / y))
# Mean squared error, labelled explicitly. The previous unlabelled value was
# the logarithm of the mean squared *log* error, which raises ValueError as
# soon as a prediction is negative.
print("mse: ", metrics.mean_squared_error(targets, preds4))

In [None]:
# Actual vs. in-sample predicted loss for the LightGBM model.
plt.style.use("ggplot")
plt.style.use("dark_background")
plt.figure(figsize=(10, 10))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments.
sns.regplot(x=targets, y=preds4, fit_reg=True, scatter=True,
            scatter_kws={"s": 100}, color="purple")
plt.xlabel("Loss", fontsize=17)
plt.ylabel("Predicted loss", fontsize=17)
plt.show()

In [None]:
# Read the sample submission file; it is used as the template for the output.
sub_data = pd.read_csv("../input/tabular-playground-series-aug-2021/sample_submission.csv")

In [None]:
# Predict the loss for the test frame with the LightGBM model.
# Relies on `test_data` already having had its `id` column removed so that
# its columns match the features the model was fitted on.
predt = ltb.predict(test_data)

In [None]:
# Write the test predictions into the submission template's loss column.
# NOTE(review): assumes the template rows are in the same order as the test
# file -- confirm.
sub_data['loss'] = predt

In [None]:
# Preview the first 50 rows of the submission.
sub_data.head(50)

In [None]:
# Save the submission as CSV without the DataFrame index column.
sub_data.to_csv("final_submission.csv",index = False)