# **Random Forest Assignment - Machine Learning Section**


### **Group Members:** Hailah AlHarthi, Khalid AlMalki, and Shaikha AlBilais

# **Libraries Importing**

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import *
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.tree import plot_tree
from sklearn import tree

from scipy.stats import *

# Set seaborn's default colour palette to 'Set2'.
sns.set_palette('Set2')

# **Data loading**

In [None]:
# All three competition files live in the same Kaggle input folder. Build every
# path from one absolute root so loading does not depend on the notebook's
# current working directory (the sample submission previously used a
# cwd-relative '../input/...' path while the other two were absolute).
DATA_DIR = "/kaggle/input/tabular-playground-series-feb-2021"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

### Look at data...

In [None]:
# Show the last rows of the training frame.
train.tail()

In [None]:
# Show the last rows of the test frame.
test.tail()

In [None]:
# Show the last rows of the submission template.
sample_submission.tail()

In [None]:
# Compare the row counts of the submission template and the test frame.
len(sample_submission),len(test)

# Do your stuff...

### Deliverables
- EDA
    - What's going on?
    - Show me the data...
    - Bonus: Other / Etc...
- Model
    - [RandomForestRegressor()](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)
    - Bonus: [KNeighborsRegressor()](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html)
    - Bonus: Other / Etc...
- Evaluation
    - mean_squared_error()
    - Bonus: Other / Etc...
- RAPIDS Bonus
    - Apply RAPIDS ([Starter Notebook](https://www.kaggle.com/tunguz/tps-feb-2021-rapids-starter))
    - Replace pandas with cuDF & sklearn with cuML

# **EDA**

## **Exploring Data**

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train.info()

In [None]:
# (rows, columns) of the training frame before any cleaning.
train.shape

In [None]:
# Summary statistics of the numeric columns.
train.describe()

In [None]:
# Summary statistics of the non-numeric columns.
train.describe(exclude='number')

## **Outliers and Distributions**

In [None]:
# Box plots of every column except the row id and the target, drawn on an
# explicitly created wide figure.
col = ['id', 'target']
fig, ax = plt.subplots(figsize=(25, 11))
sns.boxplot(data=train.drop(columns=col), ax=ax)
plt.show()

In [None]:
# Box plot of the target column on its own wide figure.
fig, ax = plt.subplots(figsize=(25, 11))
sns.boxplot(data=train['target'], ax=ax)
plt.show()

In [None]:
# Pairwise scatter plots on a 100-row random sample of the training data.
# The sample is seeded so the figure is identical on every run; the seed
# matches the one used for the train/validation split.
sns.pairplot(data=train.sample(100, random_state=30));

In [None]:
# Pairwise correlations between the numeric columns, shown as a heat-mapped table.
# The numeric columns are selected explicitly: pandas >= 2.0 no longer drops
# the string-valued cat* columns silently and raises on them, while older
# versions dropped them implicitly - so the result is the same everywhere.
corr = train.select_dtypes(include='number').corr()
corr.style.background_gradient(cmap='coolwarm')

# **Data Preparing**

## **Data Cleaning**

In [None]:
# Keep only rows that lie strictly inside the chosen 5% / 95% quantile bounds.
# Every quantile is evaluated on the frame as it is *before* this assignment,
# so re-running the cell trims the already-trimmed data again.
keep_rows = (
    train['cont0'].gt(train['cont0'].quantile(.05))
    & train['cont2'].gt(train['cont2'].quantile(.05))
    & train['cont2'].lt(train['cont2'].quantile(.95))
    & train['cont6'].lt(train['cont6'].quantile(.95))
    & train['cont8'].lt(train['cont8'].quantile(.95))
    & train['target'].lt(train['target'].quantile(.95))
    & train['target'].gt(train['target'].quantile(.05))
)
train = train[keep_rows]

In [None]:
# Frequency of each cat6 level in the filtered training data.
train['cat6'].value_counts()

In [None]:
# Box plots of every column except the row id, drawn after the quantile filter.
col = ['id']
fig, ax = plt.subplots(figsize=(25, 11))
sns.boxplot(data=train.drop(columns=col), ax=ax)
plt.show()

In [None]:
# (rows, columns) of the training frame after the quantile filter.
train.shape

In [None]:
# One-hot encode the categorical columns of both frames. Each frame is encoded
# on its own, so a level present in only one of them produces a column that
# the other frame lacks.
train, test = (pd.get_dummies(data=frame) for frame in (train, test))

## **Data Splitting**

In [None]:

# Features are everything except the label and the cat6_G dummy (the submission
# cell at the end of the notebook notes that the test data has no G level).
col = ['cat6_G', 'target']
X = train.drop(columns=col)
y = train['target']

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)


## **Data Scaling**

In [None]:
# Standardize the training features. The column labels are captured first
# because X_train is replaced by the scaler's output.
standrd = StandardScaler()
col = X_train.columns
standrd.fit(X_train)
X_train = standrd.transform(X_train)
X_train

In [None]:
# Apply the scaler fitted on the training split; only transform() is called
# here, so its statistics are not refit on the held-out rows.
X_test=standrd.transform(X_test)
X_test

# **Data Modeling**

### **Random Forest Regressor**

In [None]:
# Random forest with default hyper-parameters.
# random_state makes the fitted forest (and therefore the scores reported
# below) reproducible across runs; n_jobs=-1 trains on all cores and does not
# change the result.
rfr = RandomForestRegressor(n_jobs=-1, random_state=30)
rfr.fit(X_train, y_train)

In [None]:
# Predict the held-out split with the random forest and display the predictions.
rf_preds =rfr.predict(X_test)
rf_preds

In [None]:
# Hold-out metrics for the random forest.
rf_mae = mean_absolute_error(y_test, rf_preds)
# RMSE via an explicit square root: the `squared=False` flag is deprecated in
# newer scikit-learn releases, whereas this form works on every version.
# The variable keeps its `_mse` name because later cells read it, but it holds
# the ROOT mean squared error.
rf_mse = np.sqrt(mean_squared_error(y_test, rf_preds))
rf_r2 = r2_score(y_test, rf_preds)

In [None]:
# Print the three random-forest scores, one per line.
rf_report = {
    'RF Mean Absolute Error:': rf_mae,
    'RF Root Squared Error:': rf_mse,
    'RF R2 Score:': rf_r2,
}
for label, score in rf_report.items():
    print(label, score)

### **KNN**

In [None]:
# k-nearest-neighbours regressor using 5 neighbours, fitted on the scaled
# training split.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out split with the KNN model and display the predictions.
pred_knn = knn.predict(X_test)
pred_knn

In [None]:
# Hold-out metrics for the KNN model.
knn_mae = mean_absolute_error(y_test, pred_knn)
# RMSE via an explicit square root: the `squared=False` flag is deprecated in
# newer scikit-learn releases, whereas this form works on every version.
# The variable keeps its `_mse` name because later cells read it, but it holds
# the ROOT mean squared error.
knn_mse = np.sqrt(mean_squared_error(y_test, pred_knn))
knn_r2 = r2_score(y_test, pred_knn)

In [None]:
# Print the three KNN scores, one per line.
knn_report = {
    'KNN Mean Absolute Error:': knn_mae,
    'KNN root Squared Error:': knn_mse,
    'KNN R2 Score:': knn_r2,
}
for label, score in knn_report.items():
    print(label, score)

### **Linear Regression**

In [None]:
# Ordinary least squares with default settings, fitted on the scaled training
# split.
lir = LinearRegression()
lir.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out split with the linear model.
lir_preds=lir.predict(X_test)

In [None]:
# Hold-out metrics for the linear regression.
lir_mae = mean_absolute_error(y_test, lir_preds)
# RMSE via an explicit square root: the `squared=False` flag is deprecated in
# newer scikit-learn releases, whereas this form works on every version.
# The variable keeps its `_mse` name because later cells read it, but it holds
# the ROOT mean squared error.
lir_mse = np.sqrt(mean_squared_error(y_test, lir_preds))
lir_r2 = r2_score(y_test, lir_preds)

In [None]:
# Print the three linear-regression scores.
# The middle label previously read 'LR Root Root Error:' (a typo); it now uses
# the same wording as the other models' reports.
print('LR Mean Absolute Error:', lir_mae)
print('LR Root Squared Error:', lir_mse)
print('LR R2 Score:', lir_r2)

### **SGD Regression**

In [None]:
# Linear model trained by stochastic gradient descent.
# The optimiser shuffles the training data, so a fixed random_state is needed
# for the fitted coefficients (and the scores below) to be reproducible.
sgdr = SGDRegressor(random_state=30)
sgdr.fit(X_train, y_train)

In [None]:
# Predict the held-out split with the SGD model.
sgdr_preds=sgdr.predict(X_test)

In [None]:
# Hold-out metrics for the SGD regressor.
sgdr_mae = mean_absolute_error(y_test, sgdr_preds)
# RMSE via an explicit square root: the `squared=False` flag is deprecated in
# newer scikit-learn releases, whereas this form works on every version.
# The variable keeps its `_mse` name because later cells read it, but it holds
# the ROOT mean squared error.
sgdr_mse = np.sqrt(mean_squared_error(y_test, sgdr_preds))
sgdr_r2 = r2_score(y_test, sgdr_preds)

In [None]:
# Print the three SGD scores, one per line.
sgdr_report = {
    'SGD Mean Absolute Error:': sgdr_mae,
    'SGD Root Squared Error:': sgdr_mse,
    'SGD R2 Score:': sgdr_r2,
}
for label, score in sgdr_report.items():
    print(label, score)

# **Data Optimizing**

* Since the Linear Regression model has the lowest error (RMSE) among the models above, we will tune it with a grid search.

In [None]:
# Exhaustive search over two LinearRegression switches, scored with 10-fold
# cross-validation on the scaled training split.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 - confirm the installed version still accepts it.
p_grid = dict(fit_intercept=[True, False], normalize=[True, False])
grid = GridSearchCV(estimator=lir, param_grid=p_grid, cv=10)

grid.fit(X_train, y_train)

In [None]:
# Parameter combination that the grid search scored best.
best = grid.best_params_
print('The best parameters for the model are:', best)

In [None]:
# Refit a fresh linear model with the winning settings. `best` holds exactly
# the two searched parameters, so it can be unpacked directly.
lirg = LinearRegression(**best)
lirg.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out split with the tuned linear model.
lir_preds2=lirg.predict(X_test)

In [None]:
# Hold-out metrics for the tuned linear regression.
lirg_mae = mean_absolute_error(y_test, lir_preds2)
# RMSE via an explicit square root: the `squared=False` flag is deprecated in
# newer scikit-learn releases, whereas this form works on every version.
# The variable keeps its `_mse` name because later cells read it, but it holds
# the ROOT mean squared error.
lirg_mse = np.sqrt(mean_squared_error(y_test, lir_preds2))
lirg_r2 = r2_score(y_test, lir_preds2)

In [None]:
# Print the tuned linear model's scores under an 'After Optimization' banner.
print('After Optimization')
lirg_report = {
    'LR Mean Absolute Error:': lirg_mae,
    'LR Root Squared Error:': lirg_mse,
    'LR R2 Score:': lirg_r2,
}
for label, score in lirg_report.items():
    print(label, score)

**Selecting the best model**

In [None]:
# Hold-out RMSE of every candidate, keyed by a human-readable model name
# (the key of the winner is printed further down).
# The tuned model's label previously said "Randomized Optimization", but the
# tuning above was done with GridSearchCV, so the label now says so.
best_score = {
    'Random Forest Regressor model': rf_mse,
    'KNeighbors Regressor model': knn_mse,
    'Linear Regressor model': lir_mse,
    'SGD Regressor model': sgdr_mse,
    'Linear Regressor After Grid Search Optimization': lirg_mse,
}


In [None]:
# Pick the (name, score) pair with the smallest score; on ties the first entry
# in dictionary order wins.
min_key, min_value = min(best_score.items(), key=lambda item: item[1])

In [None]:


# Report the winning model. The scores being compared are ROOT mean squared
# errors, so the message names that metric (it previously said "mean squared
# error").
print(f'As seen in the above scores tests, we select {min_key} with a root mean squared error score:{min_value}')


**Predicting the Test dataset using the best model**

In [None]:
# The linear model was fitted on standardized features, so the competition
# test frame needs the same preparation before prediction: the same columns in
# the same order as the training features (`col` holds those labels; a missing
# dummy column is filled with 0 and extra columns are dropped), then the scaler
# fitted on the training split. Predicting on the raw frame, as before, feeds
# the model values on the wrong scale.
test_features = test.reindex(columns=col, fill_value=0)
lir_test_pred = lir.predict(standrd.transform(test_features))

In [None]:
# Display the predictions for the competition test set.
lir_test_pred

Submitting the prediction result

In [None]:
%%time
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import *
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.tree import plot_tree
from sklearn import tree

from scipy.stats import *


# data load
train = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2021/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2021/test.csv")
sample_submission = pd.read_csv('../input/tabular-playground-series-feb-2021/sample_submission.csv')

# data prep: one-hot encode the training features (label removed first)
X = pd.get_dummies(train.drop('target', axis=1))
y = train.target

# One-hot encode the test set, then force exactly the training column set AND
# order. Dummy columns the test data lacks (e.g. cat6_G) are added as all-zero
# columns in their training position. Appending such a column at the end
# instead leaves the two frames in different column orders, which misaligns
# features at predict time.
test = pd.get_dummies(test).reindex(columns=X.columns, fill_value=0)

# modeling: fit on the full training data, predict the aligned test set
rfr = RandomForestRegressor()
rfr.fit(X, y)

rf_preds = rfr.predict(test)

# save results & submit
sample_submission['target'] = rf_preds

sample_submission.to_csv('submission.csv', index=False)
