## Tabular Playground Series - Feb 2021 Competition
#### by Peter Gamal Girgis

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os       
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression,Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
import xgboost

In [None]:
# Load the training data, using the 'id' column as the row index.
# os.path.join avoids the doubled separator that plain string concatenation
# produced (path already ends in '/').
path = '/kaggle/input/tabular-playground-series-feb-2021/'
X = pd.read_csv(os.path.join(path, 'train.csv'), index_col='id')
X.head()

In [None]:
# Load the test data, using the 'id' column as the row index.
# os.path.join builds a clean path whether or not `path` ends in a separator.
X_test_full = pd.read_csv(os.path.join(path, 'test.csv'), index_col='id')
X_test_full.head()

In [None]:
# Load the sample submission file, using the 'id' column as the row index.
# os.path.join builds a clean path whether or not `path` ends in a separator.
submission = pd.read_csv(os.path.join(path, 'sample_submission.csv'), index_col='id')
submission.head()

In [None]:
# Discard rows whose target is missing, then split the target off from the features
X = X.dropna(subset=['target'], axis=0)
y = X['target']
X = X.drop(columns=['target'])

# Any feature column containing a missing value is removed from both the
# training and the test frame (the original author notes none occur in this data)
cols_with_missing = X.columns[X.isnull().any()].tolist()
X = X.drop(columns=cols_with_missing)
X_test = X_test_full.drop(columns=cols_with_missing)

# Hold out 20% of the training rows for validation; the seed is fixed so the
# split is reproducible
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [None]:
# Show the training split produced by train_test_split (a bare expression at
# the end of a notebook cell is rendered as the cell's output)
X_train_full

In [None]:
# Per-column dtype and distinct-value count, computed once on the training split
column_dtypes = X_train_full.dtypes
distinct_counts = X_train_full.nunique()

# Categorical features: object-typed columns with fewer than 10 distinct
# values (i.e. relatively low cardinality)
categorical_cols = [
    name for name in X_train_full.columns
    if distinct_counts[name] < 10 and column_dtypes[name] == 'object'
]

# Numeric features: columns stored as int64 or float64
numerical_cols = [
    name for name in X_train_full.columns
    if column_dtypes[name] in ['int64', 'float64']
]

# Restrict every split to the selected columns (categorical first), copying
# so later edits do not touch the *_full frames
my_cols = categorical_cols + numerical_cols
X_train, X_valid, X_test = (
    frame[my_cols].copy() for frame in (X_train_full, X_valid_full, X_test_full)
)

In [None]:
# Show the training features after column selection (rendered as the
# notebook cell's output)
X_train

In [None]:
# One-hot encode each split independently
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)

# Force validation and test onto exactly the training columns, in training
# order. A dummy column that is absent from a split means "this category never
# occurs here", so it is filled with 0; the previous left-join align filled
# such columns with NaN instead. Columns that exist only in validation/test
# are dropped, as before.
X_valid = X_valid.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Report the (rows, columns) shape of each split on one line, as a quick
# check that all three share the same number of columns
print(*(frame.shape for frame in (X_train, X_valid, X_test)))

In [None]:
# LightGBM regressor with the hyperparameters the author settled on
# ("4th improvement", best model)
my_model = LGBMRegressor(n_estimators=980, learning_rate=0.0324, n_jobs=4)

# Fit on the training split only; early stopping against (X_valid, y_valid)
# is left disabled
my_model.fit(X_train, y_train)

# Predict on the held-out validation split
predictions = my_model.predict(X_valid)

# Mean absolute error (scikit-learn metrics take y_true first, then y_pred)
mae = mean_absolute_error(y_valid, predictions)
print('Mean Absolute Error: ', mae)

# Root mean squared error
rmse = np.sqrt(mean_squared_error(y_valid, predictions))
print('\nRoot Mean Squared Error: ', rmse)

# Predict on the competition test set with the already-fitted model
preds_test = my_model.predict(X_test)

# Scatter plot of actual (x) vs. predicted (y) validation targets
plt.scatter(y_valid, predictions)
plt.title('LGBMRegressor model', weight='bold', size=15)
plt.show()

In [None]:
# Pair each test-set id with its prediction and write the result as a
# two-column CSV (id, target) in the current working directory
output = (
    pd.DataFrame({'target': preds_test}, index=X_test.index)
    .rename_axis('id')
    .reset_index()
)
output.to_csv('LGBMRegressor model.csv', index=False)