<a href="https://colab.research.google.com/github/silviootero/Proyecto-sustituto-Modelos1/blob/main/Copia_de_XGB_notebook_New.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Seoul Bike Rental

#### Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_log_error
from scipy import stats

#### Loading datasets into notebook

In [None]:
# Location of the competition CSVs; this is a Kaggle-style path, adjust it when running elsewhere.
dir_path = '/kaggle/input/seoul-bike-rental-ai-pro-iti'

train_csv = os.path.join(dir_path, 'train.csv')
test_csv = os.path.join(dir_path, 'test.csv')
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

# Keep the test IDs aside (they are re-attached when the submission is written),
# then remove the ID column from both frames so it is never used as a feature.
df_test_ids = df_test['ID']
df_train = df_train.drop(columns='ID')
df_test = df_test.drop(columns='ID')

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

#### Reviewing data

In [None]:
# List the column labels of the training frame.
df_train.columns

In [None]:
# Show the dtype pandas inferred for each training column.
df_train.dtypes

### Fixing names of Temperature columns


In [None]:
# The two temperature columns are renamed by POSITION rather than by their current label.
# NOTE(review): the test positions are one lower than the train positions, presumably because
# the test frame has no target column - confirm against the CSV headers.
train_labels = df_train.columns
test_labels = df_test.columns

df_train = df_train.rename(columns={
    train_labels[3]: 'Temperature (C)',
    train_labels[7]: 'Dew point temperature (C)',
})
df_test = df_test.rename(columns={
    test_labels[2]: 'Temperature (C)',
    test_labels[6]: 'Dew point temperature (C)',
})
#df_train = df_train[df_train['Functioning Day'] != 'No']
#df_train.drop(columns='Functioning Day', inplace=True)
df_train.columns

In [None]:
# Summary statistics of the training columns.
df_train.describe()

In [None]:
# Count missing values per column.
df_train.isna().sum()

There are no missing values, so that's a good start.

In [None]:
# Rank the numeric columns by variance, largest first.
# numeric_only=True is required because the frame still holds text columns at this point
# (Date and the categorical columns are only parsed/encoded later); without it recent pandas
# versions raise instead of silently skipping them.
df_train.var(numeric_only=True).sort_values(ascending=False)

We should keep an eye on the 'Snowfall' and 'Solar Radiation' columns, because with a variance this low they might add only an insignificant amount of information.

In [None]:
# we should drop after visualization so this cell should be moved
#df_temp = df_train.drop(columns=["Holiday", "Date", "Rainfall(mm)", "Wind speed (m/s)", Snowfall (cm)"])

In [None]:
def add_working_hour_column(df):
    """Add a binary ``working_hour`` column to ``df`` (in place) and return ``df``.

    The flag is 1 when ``Hour`` lies in the closed interval [5, 20] and 0 otherwise.
    """
    in_working_window = df["Hour"].between(5, 20)  # both bounds inclusive
    df["working_hour"] = in_working_window.astype(int)
    return df

In [None]:
# Derive calendar features from the Date column of both frames (train first, then test).
# NOTE(review): the dates are parsed with pandas' default format inference; if the CSV stores
# day-first dates this can swap day and month for some rows - confirm against the raw data.
for frame in (df_train, df_test):
    parsed_dates = pd.DatetimeIndex(frame['Date'])
    frame['Month'] = parsed_dates.month
    frame['Day'] = parsed_dates.day
    frame['Weekday'] = parsed_dates.weekday

In [None]:
#df_train["m_d_h"] = df_train["Month"] * 30 + df_train["Day"] * 24 + df_train["Hour"]
#df_test["m_d_h"] = df_test["Month"] * 30 + df_test["Day"] * 24 + df_test["Hour"]
# Preview the training frame again.
df_train.head()

In [None]:
# Add the binary working_hour flag to both frames.
df_train=add_working_hour_column(df_train)
df_test=add_working_hour_column(df_test)

# Some EDA

#### We'll plot the scatter plot for some selected columns

In [None]:
# feature=['y', 'Solar Radiation (MJ/m2)']
# # IQR
# Q1 = np.percentile(df_train[feature], 25,
#                    interpolation = 'midpoint',axis=0)
# print(Q1)
# Q3 = np.percentile(df_train[feature], 75,
#                    interpolation = 'midpoint',axis=0)
# print(Q3)
# IQR = Q3 - Q1

# print("Old Shape: ", df_train.shape)

# # Upper bound
# upper = np.where(df_train[feature] >= (Q3+1.5*IQR))
# # Lower bound
# lower = np.where(df_train[feature] <= (Q1-1.5*IQR))

# #Removing the Outliers
# df_train.drop(upper[0], inplace = True, axis=0)
# df_train.drop(lower[0], inplace = True, axis=0)

# print("New Shape: ", df_train.shape)

# **Encoding Categorical Columns**

In [None]:
def get_temp_range(temp_val):
    """Map a temperature to a 10-degree bucket number.

    Buckets are numbered 1..7 by their upper bound (-20, -10, ..., 40): the result is the
    number of the first bucket whose upper bound is >= ``temp_val``. Values above 40
    (and anything that compares false against every bound) return 0.
    """
    upper_bounds = range(-20, 41, 10)
    for bucket, bound in enumerate(upper_bounds, start=1):
        if temp_val <= bound:
            return bucket
    return 0

In [None]:
# df_train["temp_range"]=df_train['Temperature (C)'].apply(get_temp_range)
# df_test["temp_range"]=df_test['Temperature (C)'].apply(get_temp_range)
# Preview the training frame.
df_train.head()

In [None]:
def encode_categroical_features(df):
    """Replace the categorical columns of ``df`` with integer category codes, in place.

    ``Seasons``, ``Functioning Day`` and ``Holiday`` are each converted to pandas
    ``category`` dtype and overwritten with their ``cat.codes``. The codes are derived
    from the values present in ``df`` itself, so two frames only receive matching codes
    when they contain the same set of labels.

    Returns the same (mutated) frame.
    """
    for column in ("Seasons", "Functioning Day", "Holiday"):
        as_category = df[column].astype("category")
        df[column] = as_category.cat.codes
    return df


In [None]:
def pca_3_components(df, feature1, feature2, feature3,  new_col_name,df_test):
    """Collapse three columns into one principal component on both frames, in place.

    A single-component PCA is fitted on the three columns of ``df`` only; the fitted
    projection is then applied to the same columns of ``df_test``. The projection is
    stored under ``new_col_name`` and the three source columns are dropped from both
    frames.

    Returns ``df`` (``df_test`` is modified as a side effect).
    """
    source_cols = [feature1, feature2, feature3]
    reducer = PCA(n_components=1)

    # Fit on the training columns, then reuse the fitted projection for the test columns.
    df[new_col_name] = reducer.fit_transform(df[source_cols])
    df_test[new_col_name] = reducer.transform(df_test[source_cols])

    for frame in (df, df_test):
        frame.drop(columns=source_cols, inplace=True)
    return df

In [None]:
def filter_functioning_day(df):
    """Multiply every column of ``df`` by its ``Functioning Day`` column, in place.

    With a 0/1 flag this zeroes all values on rows where the flag is 0. The flag column
    is itself part of the loop, so it is multiplied by itself when its turn comes.

    Returns the same (mutated) frame.
    """
    for name in df.columns.tolist():
        df[name] = df[name] * df['Functioning Day']
    return df

In [None]:
def _midpoint_percentile(values, q):
    """Return the ``q``-th percentile of ``values`` using midpoint interpolation."""
    try:
        # NumPy renamed the `interpolation` keyword to `method`; prefer the new name.
        return np.percentile(values, q, method='midpoint')
    except TypeError:
        # Older NumPy releases only know the legacy keyword.
        return np.percentile(values, q, interpolation='midpoint')


def replace_outlaires(df):
    """Cap outliers of ``df`` at the 1.5 * IQR fences, in place, and return ``df``.

    Every column except ``Hour``, ``Month``, ``Day``, ``Functioning Day`` and ``Seasons``
    (all of which must be present) is clipped to ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``, where
    Q1/Q3 are the 25th/75th percentiles computed with midpoint interpolation.

    Rows are never removed, so the printed shape equals the input shape. Note that a
    column whose Q1 and Q3 coincide (IQR == 0) is flattened to that single value.
    """
    excluded = ["Hour", "Month", "Day", 'Functioning Day', 'Seasons']
    for feature in df.drop(columns=excluded).columns:
        Q1 = _midpoint_percentile(df[feature], 25)
        Q3 = _midpoint_percentile(df[feature], 75)
        IQR = Q3 - Q1

        upperL = Q3 + 1.5*IQR
        lowerL = Q1 - 1.5*IQR
        # Vectorized equivalent of mapping each value onto the nearest fence.
        df[feature] = df[feature].clip(lower=lowerL, upper=upperL)

    print("New Shape: ", df.shape)
    return df

In [None]:
 def pre_processing(df):
     """Encode the categorical columns and drop the columns not used for modelling.

     Works on a copy, so the frame passed in is left untouched and the calling cell can
     be re-run safely.

     Returns a new frame with ``Seasons`` / ``Functioning Day`` encoded as integer codes
     and ``Date``, ``Snowfall (cm)``, ``Holiday`` and ``Wind speed (m/s)`` removed.
     """
     columns_to_drop = ['Date', 'Snowfall (cm)', 'Holiday', 'Wind speed (m/s)']
     # The encoder writes into the frame it receives, so hand it a copy.
     encoded = encode_categroical_features(df.copy())
     return encoded.drop(columns=columns_to_drop)

# **Now let's look at a heat map to check for correlation**

In [None]:
#corr_mat = df_train.corr()
#fig = plt.figure(figsize = (14, 14))
#sns.heatmap(corr_mat, annot= True)
#plt.show()

* We can see a strong correlation between temp and dew point temp, so we can drop dew point
* Snowfall, Rainfall, Holiday and FunctioningDay can be dropped

In [None]:
from sklearn.decomposition import PCA
def pca_2_components(df, feature1, feature2,  new_col_name , df_test):
    """Collapse two columns into one principal component on both frames, in place.

    A single-component PCA is fitted on the two columns of ``df`` only; the fitted
    projection is then applied to the same columns of ``df_test``. The projection is
    stored under ``new_col_name`` and the two source columns are dropped from both
    frames.

    Returns ``df`` (``df_test`` is modified as a side effect).
    """
    source_cols = [feature1, feature2]
    reducer = PCA(n_components=1)

    # Fit on the training columns, then reuse the fitted projection for the test columns.
    df[new_col_name] = reducer.fit_transform(df[source_cols])
    df_test[new_col_name] = reducer.transform(df_test[source_cols])

    for frame in (df, df_test):
        frame.drop(columns=source_cols, inplace=True)
    return df

In [None]:
# Merge the two temperature columns into a single 'temp_pca' feature.
# The call modifies df_train and df_test in place, so its return value is not needed.
pca_2_components(
    df=df_train,
    feature1='Dew point temperature (C)',
    feature2='Temperature (C)',
    new_col_name='temp_pca',
    df_test=df_test,
)
#pca_2_components(df_train, 'Solar Radiation (MJ/m2)', 'working_hour', 'solar_work_pca',df_test)
columns_to_drop_aftePCA = ['Dew point temperature (C)', 'Temperature (C)']

In [None]:
# Encode categoricals and drop unused columns; df_temp is the frame used for modelling below.
df_temp =pre_processing(df_train)

In [None]:
# Cap extreme values of the modelling frame at the IQR fences.
df_temp=replace_outlaires(df_temp)

In [None]:
# corr_mat = df_temp.corr()
# fig = plt.figure(figsize = (14, 14))
# sns.heatmap(corr_mat, annot= True)
# plt.show()

# **Model**

### Functioning Day filtering

In [None]:

# Inspect the rows whose encoded 'Functioning Day' is 0.
df_temp[df_temp['Functioning Day']==0]

In [None]:
#df_temp['y'] = np.log1p(df_temp['y'])

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler

# Separate predictors from the target; Month and Day are excluded from the predictors.
features = df_temp.drop(columns=['y', 'Month', 'Day'])
target = df_temp["y"]

# Hold out 0.5% of the rows for a quick check; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.005, random_state=42
)

In [None]:
# Summary statistics of the modelling frame.
df_temp.describe()

### Apply Feature Scaling

In [None]:

# scaler=MinMaxScaler()
# X_train=scaler.fit_transform(X_train)
# X_test=scaler.transform(X_test)


In [None]:
# Learn the scaling statistics from the training split only, then apply them to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
#print(pd.DataFrame(X_train).describe())

In [None]:
# Disabled ExtraTreesRegressor experiment: the code sits inside a string literal, so it is
# not executed.
# NOTE(review): the expm1 calls look like they expect a log1p-transformed target - confirm
# before re-enabling.
'''

from sklearn.ensemble import ExtraTreesRegressor

regr = ExtraTreesRegressor(random_state=0)
regr.fit(X_train, y_train)

y_pred=regr.predict(X_test)

print(regr.score(X_test, y_test))

y_test, y_pred = np.expm1(y_test), np.expm1(y_pred)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
print(rmsle)
'''

In [None]:
#len(np.where(y_pred < y_test)[0])

# **XGB**

In [None]:
from xgboost import XGBRegressor
# Gradient-boosted regressor with a Tweedie objective; one hyper-parameter per line for easy tuning.
XGBModel = XGBRegressor(
    objective="reg:tweedie",
    tweedie_variance_power=1.6,
    gamma=2,
    max_depth=6,
    subsample=.7,
    reg_alpha=0.15,
    reg_lambda=1,
    learning_rate=0.15,
)

In [None]:
# X_train / X_test were already standardized when `scaler` was first fitted.
# Do NOT call scaler.fit_transform again here: re-fitting on standardized data resets the
# scaler to roughly zero mean / unit variance, so the later scaler.transform on the raw test
# frame would leave it essentially unscaled and the model would predict on features it was
# never trained on.
XGBModel = XGBModel.fit(X_train, y_train, verbose=False)
# score() reports R^2 on the small hold-out split.
print(XGBModel.score(X_test, y_test))

In [None]:
# Inspect the held-out target values.
y_test

In [None]:
# Predict on the hold-out split and report the root mean squared log error.
y_pred = XGBModel.predict(X_test)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
print(rmsle)

In [None]:
# Number of hold-out rows where the prediction is below the actual value.
len(np.where(y_pred < y_test)[0])

In [None]:
# Sanity check: print any rows with a negative target.
print(df_temp[df_temp["y"] < 0])

In [None]:
# Distribution of the target over the whole modelling frame.
sns.histplot(df_temp["y"])

In [None]:
# Distribution of the training targets.
sns.histplot(y_train, bins=20)

In [None]:
# Distribution of the held-out targets.
sns.histplot(y_test, bins=20)

In [None]:
# Distribution of the hold-out predictions.
sns.histplot(y_pred, bins=20)

In [None]:
# Predicted vs. actual values on the hold-out split.
# x / y must be passed by keyword: recent seaborn releases treat the first positional
# argument as `data` and reject a second positional argument.
sns.scatterplot(x=y_pred, y=y_test)

In [None]:
# Distribution of the residuals (prediction minus actual).
sns.histplot(y_pred-y_test)

In [None]:
# Count hold-out rows whose absolute error exceeds 100.
print(np.sum(np.abs(y_pred - y_test) > 100))

In [None]:
# Size of the hold-out split.
len(y_test)

In [None]:
# Preview the test frame.
df_test.head()

In [None]:

# Apply the same pre-processing function that was used on the training frame.
df_test=pre_processing(df_test)

In [None]:
# Preview the test frame again.
df_test.head()

In [None]:
# Inspect the test rows whose encoded 'Functioning Day' is 0.
df_test[df_test['Functioning Day']==0]

In [None]:
# Build the feature matrix for the competition test set.
# 'y' and 'ID' are excluded as well (and missing labels are ignored) so that this cell can be
# re-run after it has added those two columns to df_test further down.
# NOTE: keep this list in sync with the columns dropped before training whenever the feature
# set changes.
non_feature_cols = ['Month', 'Day', 'y', 'ID']
X_test = df_test.drop(columns=non_feature_cols, errors='ignore')
X_test = scaler.transform(X_test)

y_test_predicted = XGBModel.predict(X_test)

# Attach the predictions and the IDs saved at load time, then write the submission file.
df_test['y'] = y_test_predicted
df_test['ID'] = df_test_ids
df_test[['ID', 'y']].to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
# Rebuild X_test as a DataFrame without Month/Day and preview it (this overwrites the
# previous X_test).
X_test = df_test.drop(columns=['Month','Day'])
X_test.head()

In [None]:
# Summary statistics of the test frame.
df_test.describe()

In [None]:
# Inspect the test rows whose encoded 'Functioning Day' is 0.
df_test[df_test['Functioning Day']==0]