# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style ='whitegrid')
pd.set_option('display.max_columns',None)

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

# Reading Data

In [None]:
# Load the competition train and test CSVs from the Kaggle input directory.
data=pd.read_csv("../input/tabular-playground-series-mar-2022/train.csv")
test=pd.read_csv("../input/tabular-playground-series-mar-2022/test.csv")

In [None]:
# (rows, columns) of the training frame.
data.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# First rows of the training frame.
data.head()

In [None]:
# Per-column summary of the training frame: number of distinct values ('unicos'),
# number of missing values ('missing') and dtype ('tipo').
pd.DataFrame({'unicos':data.nunique(),
              'missing': data.isna().sum(),
              'tipo':data.dtypes})

In [None]:
# Descriptive statistics of the training frame.
data.describe()

# Data Processing

In [None]:
def ShowDetails():
    global data
    for col in data.columns : 
        print(f'for feature {col}')
        print(f'Number of Nulls is {data[col].isna().sum()}')
        print(f'Number of Unique Values is {len(data[col].unique())}')
        print(f'Unique Values is {data[col].unique()}')
        print(f'Random Value is {data[col][np.random.randint(data.shape[0])]}')
        print(f'Random Value is {data[col][np.random.randint(data.shape[0])]}')
        print(f'Random Value is {data[col][np.random.randint(data.shape[0])]}')
        print('\n\n==================================\n\n')

In [None]:
# Print the per-column profile of the training frame.
ShowDetails()

In [None]:
#time
# Parse the 'time' column into datetimes using an explicit year-month-day hour:minute format.
# NOTE(review): the format string has no seconds field; confirm it matches the
# timestamps in the CSV, since strict format parsing would reject trailing ":SS".
data["time"]=pd.to_datetime(data["time"],format="%Y-%m-%d %H:%M")

In [None]:
# Derive calendar features from the parsed 'time' column.
data["Hour"] = data["time"].dt.hour
data["Day"] = data["time"].dt.day
data["WeekDay"] = data["time"].dt.day_name()
data["Month"] = data["time"].dt.month
# Weekend flag computed in one vectorised step instead of a per-row apply:
# dt.dayofweek counts Monday=0 ... Sunday=6, so values >= 5 are Saturday/Sunday.
data['WeekDayCase'] = np.where(data["time"].dt.dayofweek >= 5, 'WeekEnd', 'WeekDay')
# Preview only the first rows rather than rendering the whole frame.
data.head()

In [None]:
# Row count for each value of 'WeekDayCase' ('WeekEnd' vs 'WeekDay').
data['WeekDayCase'].value_counts()

In [None]:
# Row count for each day name.
data['WeekDay'].value_counts()

In [None]:
def DayPart(H) :
    """Map an hour value to a named part of the day.

    Each boundary is an exclusive upper limit: below 6 is 'Early Morning',
    below 12 'Morning', below 15 'Noon', below 19 'Evening'; everything else
    is 'Night'.
    """
    # Ordered (exclusive upper bound, label) pairs; the first bound above H wins.
    boundaries = (
        (6, 'Early Morning'),
        (12, 'Morning'),
        (15, 'Noon'),
        (19, 'Evening'),
    )
    for upper, label in boundaries:
        if H < upper:
            return label
    return 'Night'

In [None]:
# Label every row with its part of the day, computed from the 'Hour' column by DayPart.
data['DayPart'] = data['Hour'].apply(lambda x : DayPart(int(x)))

In [None]:
# Row count per day part.
data['DayPart'].value_counts()

In [None]:
# Distinct month numbers present in the training frame.
data["Month"].unique()

In [None]:
# Preview the frame with the derived columns.
data.head()

In [None]:
# Frequency of each value of the "x" column.
data["x"].value_counts()

In [None]:
# Frequency of each value of the "y" column.
data["y"].value_counts()

In [None]:
# Distinct values of the "direction" column.
data["direction"].unique()

In [None]:
# Largest and smallest "congestion" value, shown as a (max, min) tuple.
data["congestion"].max(),data["congestion"].min()

In [None]:
def congestionLevel(x) : 
    """Bucket a congestion reading into 'Low', 'Medium' or 'High'.

    The input is converted with ``int()`` first; values below 30 are 'Low',
    values below 60 are 'Medium', and everything else is 'High'.
    """
    level = int(x)
    if level >= 60:
        return 'High'
    return 'Low' if level < 30 else 'Medium'
# Label every row with its congestion bucket, then count the rows per bucket.
congestion_buckets = data['congestion'].apply(congestionLevel)
data['congestionLevel'] = congestion_buckets
data['congestionLevel'].value_counts()

# Data Visualization

In [None]:
#Visualization of countplot
def CPlot(feature) : 
    """Draw an outlined (unfilled) count plot of ``feature`` from the global ``data``.

    The bars use a transparent face with thick edges coloured from seaborn's
    "dark" palette, and the x tick labels are rotated by 30 degrees.
    """
    global data
    fig, ax = plt.subplots(figsize=(10,6))
    outline_colors = sns.color_palette("dark", 3)
    sns.countplot(
        data=data,
        x=feature,
        ax=ax,
        facecolor=(0, 0, 0, 0),
        linewidth=5,
        edgecolor=outline_colors,
    )
    tick_labels = ax.get_xticklabels()
    ax.set_xticklabels(tick_labels, rotation=30)
    
#Visualization of kdeplot    
def KPlot(feature,Limit=0) : 
    """Draw a filled kernel-density estimate of ``feature`` from the global ``data``.

    Parameters
    ----------
    feature : str
        Column of ``data`` to plot.
    Limit : number, default 0
        When non-zero, only rows with ``data[feature] <= Limit`` are plotted;
        0 means "plot every row".
    """
    global data
    fig, ax = plt.subplots(figsize=(10,6))
    if Limit == 0:
        values = data[feature]
    else:
        values = data.loc[data[feature] <= Limit, feature]
    # `fill=True` is the current spelling of the deprecated `shade=True`, and the
    # series is passed as `x=` so it is not mistaken for a wide-form `data` argument.
    sns.kdeplot(x=values, fill=True, ax=ax)
        
 #Visualization of boxplot       
def BPlot(feature1,feature2 = None,hue = None) : 
    """Draw a box plot from columns of the global ``data`` frame.

    Parameters
    ----------
    feature1 : str
        Column plotted on the x axis.
    feature2 : str, optional
        Column plotted on the y axis; when omitted a single box of
        ``feature1`` is drawn.
    hue : str, optional
        Column used to split the boxes by colour. Only valid together with
        ``feature2``.

    Raises
    ------
    ValueError
        If ``hue`` is given without ``feature2``. That combination used to
        fall through every branch and leave an empty figure.
    """
    global data
    if feature2 is None and hue is not None:
        raise ValueError("BPlot: `hue` requires `feature2` to be given as well")
    fig, ax = plt.subplots(figsize=(10,6))
    if feature2 is None:
        # Pass the series as `x=`: a positional first argument is interpreted as
        # `data` by newer seaborn releases, which changes the plot.
        sns.boxplot(x=data[feature1], width=0.3, color='r', ax=ax)
    elif hue is None:
        sns.boxplot(x=data[feature1], y=data[feature2], width=0.3, color='r', ax=ax)
    else:
        sns.boxplot(x=data[feature1], y=data[feature2], hue=data[hue],
                    width=0.3, color='r', ax=ax)
        
#Visualization of jointplot
def JPlot(feature1,feature2,Type = 'scatter') : 
    """Draw a seaborn joint plot of two columns of the global ``data`` frame.

    ``Type`` is forwarded to seaborn as the ``kind`` of joint plot.
    """
    global data
    plot_options = dict(x=feature1, y=feature2, kind=Type)
    sns.jointplot(data=data, **plot_options)
    
 #Visualization of pie   
def Pie(feature,Limit=20) : 
    """Show a pie chart of the most frequent values of ``feature`` in the global ``data``.

    Only the first ``Limit`` entries of ``value_counts()`` are drawn; every
    wedge is labelled with its value and percentage and slightly exploded.
    """
    global data
    fig, ax = plt.subplots(figsize=(10,6))
    # Count once and reuse for sizes, labels and the explode list.
    top_counts = data[feature].value_counts()[:Limit]
    plt.pie(
        top_counts,
        labels=list(top_counts.index),
        autopct='%1.2f%%',
        labeldistance=1.1,
        explode=[0.05] * len(top_counts),
    )
    plt.show()

In [None]:
# Preview the frame before plotting.
data.head()

In [None]:
# Count plot of rows per day name.
CPlot("WeekDay")

In [None]:
# Count plot of rows per part of day.
CPlot("DayPart")

In [None]:
# Share of rows in each congestion bucket.
Pie("congestionLevel")

In [None]:
# Share of rows per direction.
Pie("direction")

In [None]:
# Share of rows per value of "x".
Pie("x")

In [None]:
# Share of rows per value of "y".
Pie("y")

In [None]:
# Density of the "congestion" column over all rows (default Limit=0).
KPlot("congestion")

In [None]:
data.head()

In [None]:
def SelectedCongestionLevelPie(feature,Limit=10) : 
    """Show one pie of ``feature`` per congestion level, side by side.

    The global ``data`` frame is filtered to each of the 'High', 'Medium' and
    'Low' values of its 'congestionLevel' column (in that order), and the
    first ``Limit`` entries of ``value_counts()`` of ``feature`` are drawn
    for each subset.

    Parameters
    ----------
    feature : str
        Column of ``data`` whose distribution is shown in each pie.
    Limit : int, default 10
        Maximum number of wedges per pie.
    """
    global data
    levels = ('High', 'Medium', 'Low')
    # Create the 1x3 grid directly. Calling plt.subplots() for a single Axes and
    # then plt.subplot(1, 3, k) leaves that full-figure Axes behind the pies on
    # matplotlib versions that no longer auto-remove overlapping Axes.
    fig, axes = plt.subplots(1, len(levels), figsize=(15,4))
    for ax, level in zip(axes, levels):
        counts = data.loc[data['congestionLevel'] == level, feature].value_counts()[:Limit]
        ax.set_title(level)
        ax.pie(
            counts,
            labels=list(counts.index),
            autopct='%1.2f%%',
            labeldistance=1.1,
            explode=[0.05] * len(counts),
        )
    plt.show()

In [None]:
# For each feature below: one pie per congestion level (High / Medium / Low).
SelectedCongestionLevelPie("DayPart")

In [None]:
SelectedCongestionLevelPie("WeekDayCase")

In [None]:
SelectedCongestionLevelPie("direction")

In [None]:
SelectedCongestionLevelPie("x")

In [None]:
SelectedCongestionLevelPie("y")

In [None]:
SelectedCongestionLevelPie("Month")

In [None]:
# Distinct direction codes present in the data.
data["direction"].unique()

In [None]:
def SelecteddirectionlPie(feature,Limit=10) : 
    """Show one pie of ``feature`` per direction code in a 2x4 grid.

    The global ``data`` frame is filtered to each direction code
    ('EB', 'NB', 'SB', 'WB', 'NE', 'SW', 'NW', 'SE', in that order, filling
    the grid row by row), and the first ``Limit`` entries of
    ``value_counts()`` of ``feature`` are drawn for each subset. Each panel
    is titled ``'D <code>'``.

    Parameters
    ----------
    feature : str
        Column of ``data`` whose distribution is shown in each pie.
    Limit : int, default 10
        Maximum number of wedges per pie.
    """
    global data
    directions = ('EB', 'NB', 'SB', 'WB', 'NE', 'SW', 'NW', 'SE')
    # Create the 2x4 grid directly. Calling plt.subplots() for a single Axes and
    # then plt.subplot(2, 4, k) leaves that full-figure Axes behind the pies on
    # matplotlib versions that no longer auto-remove overlapping Axes.
    fig, axes = plt.subplots(2, 4, figsize=(12,8))
    # axes.flat walks the grid row by row, matching subplot positions 1..8.
    for ax, direction in zip(axes.flat, directions):
        counts = data.loc[data['direction'] == direction, feature].value_counts()[:Limit]
        ax.set_title(f'D {direction}')
        ax.pie(
            counts,
            labels=list(counts.index),
            autopct='%1.2f%%',
            labeldistance=1.1,
            explode=[0.05] * len(counts),
        )
    plt.show()

In [None]:
# For each feature below: one pie per direction code.
SelecteddirectionlPie("congestionLevel")

In [None]:
SelecteddirectionlPie("WeekDayCase")

In [None]:
SelecteddirectionlPie("x")

In [None]:
SelecteddirectionlPie("y")

# Data Preparation

In [None]:
# Preview the frame before encoding the categorical columns.
data.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# One encoder per categorical column; each is kept as a named variable and
# reused later to transform the test set.
directionEnc,WeekDayCaseEnc,DayPartEnc=LabelEncoder(),LabelEncoder(),LabelEncoder()

In [None]:
def ApplyEncoder(Encoder,OriginalColumn) : 
    """Fit ``Encoder`` on one column of the global ``data`` frame and return it encoded.

    The encoder object is fitted in place, so the caller can reuse it
    afterwards to transform other data.
    """
    global data
    column = data[OriginalColumn]
    Encoder.fit(column)
    encoded = Encoder.transform(column)
    return encoded

In [None]:
# Encode the three categorical columns into new '*Enc' columns; each call also
# fits the corresponding encoder on the training data.
data['directionEnc'] = ApplyEncoder(directionEnc,'direction')
data['WeekDayCaseEnc'] = ApplyEncoder(WeekDayCaseEnc,'WeekDayCase')
data['DayPartEnc'] = ApplyEncoder(DayPartEnc,'DayPart')
data.head()

In [None]:
# List the available columns before picking the model inputs.
data.columns

In [None]:
# Feature matrix: coordinates, calendar features and the encoded categoricals.
X = data[['x', 'y', 'Hour', 'Day',
       'Month', 'directionEnc',  'WeekDayCaseEnc', 'DayPartEnc']]

# Regression target.
y = data['congestion']

In [None]:
X

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Shuffled 80/20 split with a fixed seed so the split is repeatable.
X_train,X_test, y_train,y_test = train_test_split(X,y,test_size = 0.2,shuffle = True , random_state = 44)

# Shapes of the four pieces as a sanity check.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

# Building The Model

In [None]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,median_absolute_error

# Candidate regressors, all with default hyper-parameters.
RandomForestRegressorModel = RandomForestRegressor(random_state=22)
# Seeded like the forest: an unseeded tree can break split ties differently
# from run to run, which makes the printed scores non-reproducible.
DecisionTreeRegressorModel = DecisionTreeRegressor(random_state=22)
KNeighborsRegressorModel = KNeighborsRegressor()


Models = [RandomForestRegressorModel,DecisionTreeRegressorModel,KNeighborsRegressorModel]

In [None]:
# Fit every candidate on the training split and report its scores and errors
# on the held-out split.
for Model in Models:
    print(f'for Model {type(Model).__name__}')
    Model.fit(X_train, y_train)
    print(f'Train Score is : {Model.score(X_train, y_train)}')
    print(f'Test Score is : {Model.score(X_test, y_test)}')
    y_pred = Model.predict(X_test)
    print(f'MAE value is  : {mean_absolute_error(y_test, y_pred)}')
    print(f'MSE value is  : {mean_squared_error(y_test, y_pred)}')
    # median_absolute_error is not a squared error, so it is labelled MedAE.
    print(f'MedAE value is  : {median_absolute_error(y_test, y_pred)}')
    print('=================================================')

In [None]:
from sklearn.model_selection import GridSearchCV

# Tune the number of trees and the tree depth of a seeded random forest.
SelectedModel = RandomForestRegressor(random_state=22)
SelectedParameters = {'n_estimators':[100,200,500,1000],'max_depth':[2,4,8,10]}

# 2-fold cross-validation over all 16 parameter combinations. No `scoring` is
# passed, so candidates are ranked by GridSearchCV's default scoring.
GridSearchModel = GridSearchCV(SelectedModel,SelectedParameters,
                               cv = 2,return_train_score=True)
GridSearchModel.fit(X_train, y_train)
# Keep only the columns needed to compare candidates.
GridSearchResults = pd.DataFrame(GridSearchModel.cv_results_)[
    ['mean_test_score','std_test_score','params','rank_test_score','mean_fit_time']]

# Showing Results
print('All Results are :\n', GridSearchResults )
print('Best Score is :', GridSearchModel.best_score_)
print('Best Parameters are :', GridSearchModel.best_params_)
print('Best Estimator is :', GridSearchModel.best_estimator_)

In [None]:
# Display the best estimator found by the grid search.
GridSearchModel.best_estimator_

# Test Data

In [None]:
# Parse the test-set 'time' column with the same format used for the training set.
test["time"] = pd.to_datetime(test["time"], format="%Y-%m-%d %H:%M")
# Derive the same calendar features as for the training set.
test["Hour"] = test["time"].dt.hour
test["Day"] = test["time"].dt.day
test["WeekDay"] = test["time"].dt.day_name()
test["Month"] = test["time"].dt.month
# Weekend flag computed in one vectorised step instead of a per-row apply:
# dt.dayofweek counts Monday=0 ... Sunday=6, so values >= 5 are Saturday/Sunday.
test['WeekDayCase'] = np.where(test["time"].dt.dayofweek >= 5, 'WeekEnd', 'WeekDay')

test['DayPart'] = test['Hour'].apply(lambda x : DayPart(int(x)))
test.head()

In [None]:
def ApplyTestEncoder(Encoder,OriginalColumn) : 
    """Encode one column of the global ``test`` frame with ``Encoder``.

    Only ``transform`` is called; the encoder is not fitted here.
    """
    global test
    column = test[OriginalColumn]
    return Encoder.transform(column)
# Encode the test-set categoricals with the encoders created for the training data.
test['directionEnc'] = ApplyTestEncoder(directionEnc,'direction')
test['WeekDayCaseEnc'] = ApplyTestEncoder(WeekDayCaseEnc,'WeekDayCase')
test['DayPartEnc'] = ApplyTestEncoder(DayPartEnc,'DayPart')
test.head()


In [None]:
# List the test-set columns before picking the model inputs.
test.columns

In [None]:
X=test[['x', 'y','Hour', 'Day',
       'Month','directionEnc', 'WeekDayCaseEnc',
       'DayPartEnc']]

In [None]:
Results = GridSearchModel.best_estimator_.predict(X)

In [None]:
# Re-read the raw test CSV into a separate frame (without the engineered columns).
Test = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv')
Test.head()

# Submission File


In [None]:
# Attach the predicted congestion values and keep only the id and prediction columns.
Test["congestion"] = Results
congestion_target=Test[["row_id","congestion"]]

In [None]:
congestion_target

In [None]:
# index=False keeps the pandas index out of the file, so the CSV contains
# exactly the row_id and congestion columns.
congestion_target.to_csv('Predicted_of_congestion.csv', index=False)