In [None]:

from pathlib import Path

import os
# Print the full path of every file found below the Kaggle input directory.
kaggle_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in kaggle_files:
    print(file_path)
        
# EDA: exploratory data analysis

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline 

from sklearn.preprocessing import OneHotEncoder

# Models from Scikit-Learn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor

## Model evaluators
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error, mean_absolute_error
        
# Directory holding the competition CSV files; file names are joined onto it with `/`.
input_path = Path('/kaggle/input/tabular-playground-series-feb-2021/')



In [None]:
# Read the sample submission (indexed by its `id` column) and preview the first rows.
submission_csv = input_path / 'sample_submission.csv'
submission = pd.read_csv(submission_csv, index_col='id')
display(submission.head())

In [None]:
# Read the training data (indexed by its `id` column) and preview the first rows.
train_csv = input_path / 'train.csv'
train = pd.read_csv(train_csv, index_col='id')
display(train.head())

In [None]:
# Read the test data (indexed by its `id` column) and preview the first rows.
test_csv = input_path / 'test.csv'
test = pd.read_csv(test_csv, index_col='id')
display(test.head())

In [None]:
# Number of rows in the training frame.
len(train)

In [None]:
# Number of rows in the test frame.
len(test)

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Scatter the first 100 `cat0` values against the matching `target` values.
first_cat0 = train["cat0"][:100]
first_target = train["target"][:100]
fig, ax = plt.subplots()
ax.scatter(first_cat0, first_target)

In [None]:
# Scatter `cat1` against `target`; the low alpha makes overlapping points visible as density.
train.plot(kind="scatter", x="cat1", y="target", alpha=0.1)

In [None]:
# Histogram of the target column.
train.target.plot.hist()

In [None]:
# Frequency of each distinct value in `cat0`.
train["cat0"].value_counts()

In [None]:
 import seaborn as sns

# Pairwise correlations between the numeric columns only. The object (categorical)
# columns are excluded explicitly: recent pandas versions raise on non-numeric
# data in DataFrame.corr() instead of silently skipping it.
corr_matrix = train.select_dtypes(include="number").corr()

# Annotated heatmap of the correlation matrix, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(15, 10))
ax = sns.heatmap(corr_matrix,
                 annot=True,
                 linewidths=0.5,
                 fmt=".3f",
                 cmap="YlGnBu",
                 ax=ax)

In [None]:
# Correlation of every column with `target`, sorted from highest to lowest.
corr_matrix["target"].sort_values(ascending=False)

## Prepare the data for Machine Learning algorithms

### Prepare the categorical features

In [None]:
# Summary of the training frame: column dtypes, non-null counts and memory usage.
train.info()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Label-encode every categorical (object) column of the training frame in place.
# Each encoder is fitted on the union of the train and test values, and the
# matching test column is encoded with that same encoder, so a category always
# maps to the same integer code in both frames. Fitting the two frames separately
# only agrees when both happen to contain exactly the same set of categories.
df = train
for c in df.columns:
    if df[c].dtype == 'object':
        lbl = LabelEncoder()
        df[c] = df[c].fillna('N')  # missing values become their own category
        known_values = list(df[c].values)
        shares_column = c in test.columns and test[c].dtype == 'object'
        if shares_column:
            test[c] = test[c].fillna('N')
            known_values += list(test[c].values)
        lbl.fit(known_values)
        df[c] = lbl.transform(df[c].values)
        if shares_column:
            test[c] = lbl.transform(test[c].values)

# X is the same object as `train` (not a copy) and still contains "target".
X = df



In [None]:
# Label-encode the categorical (object) columns of the test frame in place.
# Each column gets a fresh encoder fitted on the test values of that column.
# NOTE(review): codes produced here match the training codes only if both frames
# contain the same categories for the column — confirm.
df = test
for c in df.columns:
    if df[c].dtype != 'object':
        continue
    df[c] = df[c].fillna('N')
    lbl = LabelEncoder()
    lbl.fit(list(df[c].values))
    df[c] = lbl.transform(df[c].values)

test = df

In [None]:
# Separate the label from the features. Both are derived from `train` (the frame X
# refers to at this point) rather than from X itself, so the cell can be re-run:
# reading "target" from an X that has already dropped it would raise a KeyError.
y = train["target"].copy()
X = train.drop("target", axis=1)

In [None]:
# First ten rows of the test frame.
test[:10]

## Modeling

In [None]:
# Seed NumPy's global generator so the shuffle below is repeatable.
np.random.seed(42)

# Hold out 20% of the rows for evaluation; the remaining 80% are used for fitting.
split_parts = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

### Scores

In [None]:
def rmse(model, X=None, y=None):
    """Return the root mean squared error of ``model`` on an evaluation set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : features to predict on; defaults to the notebook's ``X_test``.
    y : true target values for ``X``; defaults to the notebook's ``y_test``.

    Returns
    -------
    dict
        A single entry mapping a descriptive label to the RMSE value.
    """
    if X is None:
        X = X_test
    if y is None:
        y = y_test
    yhat = model.predict(X)
    mse = mean_squared_error(y, yhat)
    # By default this is scored on the hold-out split, so the label must not
    # call it a training error.
    return {"Root mean squared error ": np.sqrt(mse)}
    

In [None]:
def display_scores(scores):
    """Print the given scores, then their mean and their standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` (e.g. a NumPy array).
    """
    print("Scores:", scores)
    average = scores.mean()
    print("Mean:", average)
    spread = scores.std()
    print("Standard deviation:", spread)

### LinearRegression Model

In [None]:
# Fit a linear regression on the training split, then score it with rmse().
model_lr = LinearRegression(n_jobs=-1).fit(X_train, y_train)
rmse(model_lr)

In [None]:
# Persist the fitted linear model. The context manager closes the file handle
# (flushing the pickle to disk) even if serialization raises.
with open('linear_regression_model', 'wb') as model_file:
    pickle.dump(model_lr, model_file)

### SGDRegressor Model

In [None]:
# Fit a stochastic-gradient-descent regressor on the training split, then score it.
model_sdg = SGDRegressor(max_iter=100, tol=1e-3).fit(X_train, y_train)
rmse(model_sdg)

### Cross-validation (cross_val_score) with LinearRegression

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the linear regression model on the training split.
# The variables are named after the model they describe (they previously carried
# a "forest" prefix although the estimator is model_lr).
# scikit-learn reports negated MSE for this scoring, so flip the sign before the
# square root to obtain one RMSE per fold.
lr_neg_mse_scores = cross_val_score(model_lr, X_train, y_train,
                                    scoring="neg_mean_squared_error", cv=10,
                                    verbose=True, n_jobs=-1)
lr_rmse_scores = np.sqrt(-lr_neg_mse_scores)
display_scores(lr_rmse_scores)

### RandomForestRegressor Model

In [None]:
# Random forest of 100 trees, trained on all available cores with progress output.
rf_settings = dict(n_jobs=-1, verbose=True, n_estimators=100)
model_rf = RandomForestRegressor(**rf_settings)
model_rf.fit(X_train, y_train)


In [None]:
# Score the random forest with the rmse() helper.
rmse(model_rf)

In [None]:
# Random forest predictions for the X_test split.
model_rf.predict(X_test)

In [None]:
# Persist the fitted random forest. The context manager closes the file handle
# (flushing the pickle to disk) even if serialization raises.
with open('finalized_model_rf.pkl', 'wb') as model_file:
    pickle.dump(model_rf, model_file)

### Hyperparameter tuning of RandomForestRegressor with RandomizedSearchCV

In [None]:

# Search space for the RandomForestRegressor hyperparameters
param_distribs = {"n_estimators": np.arange(10, 100, 10),
                  "max_depth": [None, 3, 5, 10, 20],    # maximum depth of each tree
                  "min_samples_split": np.arange(2, 20, 2),
                  "min_samples_leaf": np.arange(1, 20, 2),
                  # Features considered per split: a float is a fraction of the
                  # columns, an int an absolute count. 1.0 (all features) replaces
                  # the "auto" alias, which selected all features for regressors
                  # and is no longer accepted by newer scikit-learn releases.
                  "max_features": [0.5, 1, "sqrt", 1.0],
                 }
                  #"max_samples": [3000]}

# Try 5 random parameter combinations with 2-fold cross-validation, ranking them
# by negated mean squared error (higher is better).
rs_model_rf = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1),
                              param_distributions=param_distribs,
                              n_iter=5,scoring='neg_mean_squared_error',
                              cv=2,  random_state=42)

rs_model_rf.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best cross-validated score.
rs_model_rf.best_params_

In [None]:
# The search score is a negated MSE, so negate it before taking the square root.
# The result gets its own name: assigning it to `rmse` would overwrite the rmse()
# helper function and make every later (or re-run) call to it fail.
negative_mse = rs_model_rf.best_score_
rs_rf_rmse = np.sqrt(-negative_mse)
rs_rf_rmse


In [None]:
# Predictions of the tuned random forest search for the X_test split.
rs_model_rf.predict(X_test)

In [None]:
 # Persist the fitted randomized search. The context manager closes the file
 # handle (flushing the pickle to disk) even if serialization raises.
 with open('finalized_model_rf_cv.pkl', 'wb') as model_file:
     pickle.dump(rs_model_rf, model_file)

### Hyperparameter tuning of SGDRegressor with RandomizedSearchCV

In [None]:

# Candidate SGDRegressor settings for the randomized search.
# NOTE(review): 'squared_loss' is the legacy spelling of 'squared_error' — confirm
# the installed scikit-learn version still accepts it.
param_distribs = dict(
    alpha=10.0 ** -np.arange(1, 3),
    loss=['squared_loss', 'huber'],  # 'epsilon_insensitive' is not part of the search
    penalty=['l2', 'l1'],
    learning_rate=['constant', 'optimal'],
    max_iter=np.arange(1, 100),
)

# Try 5 random combinations with 2-fold cross-validation, ranked by negated MSE.
clf = RandomizedSearchCV(
    SGDRegressor(),
    param_distributions=param_distribs,
    n_iter=5,
    scoring='neg_mean_squared_error',
    cv=2,
    random_state=42,
)

clf.fit(X_train, y_train)
print("Best score:", clf.best_score_)


In [None]:
# Parameter combination that achieved the best cross-validated score.
clf.best_params_

In [None]:
# The search score is a negated MSE, so negate it before taking the square root.
# The result gets its own name: assigning it to `rmse` would overwrite the rmse()
# helper function and make every later (or re-run) call to it fail.
negative_mse = clf.best_score_
rs_sgd_rmse = np.sqrt(-negative_mse)
rs_sgd_rmse

In [None]:
# Persist the SGD model. The context manager closes the file handle (flushing the
# pickle to disk) even if serialization raises.
# NOTE(review): the file name suggests the randomized-search result (`clf`), but
# the object written is the plain `model_sdg` — confirm which one is intended.
with open('finalized_rs_model_sdg.pkl', 'wb') as model_file:
    pickle.dump(model_sdg, model_file)

In [None]:
# Predict the test set with the SGD model and write the submission file.
sgd_predictions = model_sdg.predict(test)
submission['target'] = sgd_predictions
submission.to_csv('SGDRegressor.csv')