In [None]:

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
%matplotlib inline
import os
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from scipy import stats
from scipy.stats import norm, skew
from sklearn import preprocessing
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor, plot_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
import math as mh

# 1-1 Analyse exploratoire et visualisation

<h3> 1. Pretraitement des donnees</h3>

In [None]:
# Load the training set from the zipped CSV and display its (rows, columns) shape.
df =pd.read_csv("/kaggle/input/restaurant-revenue-prediction/train.csv.zip",sep=',')
df.shape

<p>L'ensemble de données est assez petit, il faut donc éviter les modèles complexes avec de nombreux paramètres. L'utilisation d'un modèle complexe pour cet ensemble de données entraînera un surajustement du modèle par rapport à l'ensemble de données. Des techniques de régularisation devront certainement être utilisées pour éviter la possibilité de surapprentissage.</p>

In [None]:
# Load the test set from the zipped CSV and display its (rows, columns) shape.
df_test=pd.read_csv("/kaggle/input/restaurant-revenue-prediction/test.csv.zip",sep=',')
df_test.shape

<p>L'ensemble de test contient donc 100 000 restaurants.</p>

In [None]:
# Split the columns by dtype: numeric ones vs. everything that is neither numeric nor datetime.
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = df.select_dtypes(exclude=[np.number, np.datetime64]).columns.tolist()
print(categorical_features)
# Override the detected list with the three columns actually treated as categorical
# (the hard-coded list leaves out 'Open Date').
categorical_features = ['City', 'City Group', 'Type']
print(categorical_features)
print(numerical_features)

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Preview the last rows of the training frame.
df.tail()

In [None]:
# List the column labels of the training frame.
df.columns

<p>Il existe principalement 2 caractéristiques :
City Group
Type (Type de restaurant. FC : Food Court, IL : Inline, DT : Drive Thru, MB : Mobile)
Et 37 Variables numériques (discrètes)
P1 à P37 </p>

In [None]:
# Print dtypes and non-null counts for every training column.
df.info()

In [None]:
# Summary statistics of the numeric columns. The method must be *called*:
# a bare `df.describe` only displays the bound-method repr, not the statistics.
df.describe()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Print dtypes and non-null counts for every test column.
df_test.info()

In [None]:
# List the column labels of the test frame.
df_test.columns

<h4>a. Détection des valeurs manquantes</h4>

In [None]:
# Keep an untouched copy of the training frame as loaded, before any filtering or encoding.
df2=df.copy()

In [None]:
# Total number of missing values across the whole training frame.
df.isnull().sum().sum()

In [None]:
# Fraction of missing values per column, ordered by column name
# (the mean of a boolean mask is the count of True divided by the row count).
df.isnull().mean().sort_index()

<h4>b- la transformation des données</h4>

In [None]:
# Compare the distribution of restaurant types in the train (left) and test (right) sets.
fig, ax = plt.subplots(1, 2, figsize=(19, 5))
# Pass the column explicitly as `x=`: in recent seaborn releases the first positional
# parameter of countplot is `data`, so a positional Series is no longer read as x.
g1 = sns.countplot(x=df['Type'], palette="Set2", ax=ax[0])
g2 = sns.countplot(x=df_test['Type'], palette="Set2", ax=ax[1])
ax[0].set_title("Train")
ax[1].set_title("Test")
# plt.show() instead of fig.show(): the latter warns under the non-GUI inline backend.
plt.show()

In [None]:
# Number of distinct cities in the training set vs. the test set.
(df['City'].nunique(), df_test['City'].nunique())

<p>Le type MB sera remplacé par le type DT dans l'ensemble de test car il n'est pas disponible dans notre ensemble d'entraînement. La fonctionnalité Ville est inutile car notre ensemble d'entraînement contient 34 villes uniques, mais l'ensemble de test contient 57 villes uniques.</p>

In [None]:
# Recode test-set restaurants of type 'MB' as 'DT' (in place, via a boolean row mask).
df_test.loc[df_test['Type']=='MB', 'Type'] = 'DT'

In [None]:
# Display the training rows whose revenue exceeds 10,000,000.
df[df['revenue'] > 10000000 ]

In [None]:
# Drop outliers: keep only restaurants whose revenue is below 10,000,000.
# The result of reset_index() must be assigned back; otherwise the filtered frame keeps
# gaps in its index and later label-based lookups can hit a missing label.
df = df[df['revenue'] < 10000000].reset_index(drop=True)
df.head()

<h3>2-</h3>

<h4>a- La ville comportant le plus grand nombre de restaurants</h4>



In [None]:
# Number of restaurants per city, most frequent first.
df["City"].value_counts()

<p>Istanbul est la ville qui compte le plus grand nombre de restaurants (50).</p>

In [None]:
# Bar chart of the number of restaurants per city.
fig, ax = plt.subplots(figsize=(30, 10))
city_revenue_group = df["City"].value_counts()
x_axis, y_axis = city_revenue_group.index, city_revenue_group
ax.bar(x_axis, y_axis)
ax.set_xlabel("Ville")
ax.set_ylabel("Nombre De restaurants")
plt.show()

<h4>b- Quelles sont les caractéristiques les plus corrélées avec la cible ?</h4>

In [None]:
# Target variable: the 'revenue' column of the training frame.
y= df['revenue']

In [None]:
# Summary statistics of the target.
y.describe()

In [None]:
# Pearson correlation between the numeric features (the target itself is left out).
# Restrict to numeric columns explicitly: the frame still holds text columns, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
plt.figure(figsize=(10, 8))
feature_corr = df.drop(columns=['revenue']).select_dtypes(include=[np.number]).corr()
sns.heatmap(feature_corr, square=True)
plt.suptitle('Pearson Correlation Heatmap')
plt.show()

In [None]:
# Correlation of every numeric feature with the target, strongest positive first.
# select_dtypes keeps only numeric columns; the remaining text columns would make
# DataFrame.corr() raise in pandas >= 2.0.
corr_with_revenue = (
    df.select_dtypes(include=[np.number])
    .corr()['revenue']
    .sort_values(ascending=False)
)
plt.figure(figsize=(10, 7))
corr_with_revenue.drop('revenue').plot.bar()
plt.show()

<p>Les caractéristiques les plus corrélées avec la cible sont P2, P28 et P6.</p>

<h4>c- Quel type de restaurant est le plus présent dans ce dataset ?</h4>

In [None]:
# Number of restaurants per type, most frequent first.
df["Type"].value_counts()

In [None]:
# Bar chart of the number of restaurants per restaurant type.
fig, ax = plt.subplots(figsize=(30, 10))
res_type = df["Type"].value_counts()
x_axis, y_axis = res_type.index, res_type
ax.bar(x_axis, y_axis)
ax.set_xlabel("Type de Restaurant")
ax.set_ylabel("Nombre")
plt.show()

<h3>3- Kmeans</h3>

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn import cluster
# Cluster the restaurants on a hand-picked subset of the P-variables.
relevant_pvars = ["P1", "P2", "P11", "P19", "P20", "P23", "P30"]
train = df.loc[:, relevant_pvars]
# Fix the seed: KMeans starts from random centroids, so without random_state the
# clusters change on every Restart & Run All.
kmeans = cluster.KMeans(n_clusters=5, random_state=42)
kmeans.fit(train)


<h3>4. DBSCAN</h3>

<h4>a. KNN</h4>

In [None]:
from sklearn.neighbors import NearestNeighbors
# k-distance plot used to choose DBSCAN's eps: distance to the 3rd returned neighbour
# of every point, sorted in ascending order.
nbrs = NearestNeighbors(n_neighbors=3).fit(train)
distances, indices = nbrs.kneighbors(train)
distanceDec = sorted(distances[:, 3-1], reverse=False)
# Plot against the rank in the sorted list. The previous x-axis, indices[:, 0], is the
# (unsorted) index of each point's first neighbour and does not line up with the sorted
# distances, which scrambles the curve as soon as it differs from 0..n-1.
plt.plot(range(len(distanceDec)), distanceDec)
plt.xlabel('Points sorted according to distance of 3th nearest neighbor')
plt.ylabel('3th Nearest Neighbor Distance')
plt.show()

In [None]:
# Same k-distance curve with a horizontal line at the candidate eps value (3).
plt.axhline(3)
# x-axis is the rank in the sorted distance list; indices[:, 0] is unsorted and does
# not correspond to the sorted distances.
plt.plot(range(len(distanceDec)), distanceDec)
plt.xlabel('Points sorted according to distance of 3th nearest neighbor')
plt.ylabel('3th Nearest Neighbor Distance')
plt.show()

In [None]:
from sklearn.cluster import DBSCAN
# Density-based clustering of the selected P-variables with eps=3.
# NOTE(review): min_samples is 20 here while the k-distance curve was computed with
# n_neighbors=3 - confirm that eps=3 is still appropriate for this min_samples.
dbscan=DBSCAN(eps=3, min_samples=20)
dbscan.fit(train)

In [None]:
# Cluster label assigned by DBSCAN to each row of `train`.
labels=dbscan.labels_

In [None]:
# Rows whose DBSCAN label is -1, treated here as extreme points.
df[labels==-1]

In [None]:
# Shape of the subset of rows labelled -1 (the extreme points).
df[labels==-1].shape

<h3>5- t-SNE</h3>

In [None]:
from sklearn.manifold import TSNE

In [None]:
# 2-D t-SNE embedding of the selected P-variables, coloured by revenue.
# random_state is fixed because t-SNE is stochastic; without it the picture changes
# on every run.
model = TSNE(learning_rate=10, random_state=42)
tsne_features = model.fit_transform(train)
xs = tsne_features[:, 0]
ys = tsne_features[:, 1]
plt.scatter(xs, ys, c=y)
plt.show()
# The trailing plt.clf() was removed: called after plt.show() it creates and clears a
# brand-new figure, which only adds an empty figure to the cell output.

<h1>1.2 Features engineering</h1>

In [None]:
# Stack train (without the target) on top of test so both go through the same feature
# transformations; 'City' is removed from the combined frame.
df_train = df.drop('revenue', axis=1)  # drop() returns a new frame, df is left untouched
df_full = pd.concat([df_train, df_test]).drop('City', axis=1)
# Names of the 37 P-columns: 'P1' ... 'P37'.
p_name = [f'P{i}' for i in range(1, 38)]

In [None]:
from sklearn.decomposition import PCA
# Replace the 37 P-columns by their first 29 principal components.
pca_list = ['pca' + str(i) for i in range(1, 30)]
# Fit once (the previous extra PCA().fit(...) result was never used). random_state is
# set because, depending on the scikit-learn version, the automatically selected SVD
# solver may be a randomized one.
pca = PCA(n_components=len(pca_list), random_state=42)
df_full[pca_list] = pca.fit_transform(df_full[p_name])
# Rebind instead of inplace=True so the cell does not mutate the frame behind its name.
df_full = df_full.drop(columns=p_name)

In [None]:
# One-hot encode the remaining categorical columns.
# 'Id' is a row identifier, not a predictor, and 'Open Date' is a free-form date string:
# one-hot encoding it would create one column per distinct date. Both are removed before
# encoding (errors='ignore' keeps the cell working if they were already dropped).
df3 = pd.get_dummies(
    df_full.drop(columns=['Id', 'Open Date'], errors='ignore'),
    dtype=float,
)

<h3>1-Visualisation de la correlation</h3>

In [None]:
# Correlation of each numeric column with the target, strongest positive first.
# The frame still contains text columns, so corr() is computed on the numeric subset
# only (DataFrame.corr() raises on non-numeric data in pandas >= 2.0).
corr_with_target = (
    df.select_dtypes(include=[np.number])
    .corr()['revenue']
    .sort_values(ascending=False)
)
plt.figure(figsize=(14, 7))
corr_with_target.drop('revenue').plot.bar()
plt.show()

<h3>2- la matrice de corrélation</h3>

In [None]:
# Separate text columns from numeric ones by dtype instead of sniffing one cell per
# column: DataFrame.iteritems() was removed in pandas 2.0, and colvalue[1] is a
# label-based lookup that fails when row label 1 is absent from the index.
str_list = df.select_dtypes(exclude=[np.number]).columns.tolist()  # non-numeric columns
num_list = df.columns.difference(str_list)

df_num = df[num_list]
f, ax = plt.subplots(figsize=(30, 20))
plt.title('Pearson Correlation of features')
# Draw the annotated correlation heatmap with seaborn.
sns.heatmap(df_num.astype(float).corr(), linewidths=0.25, vmax=1.0, square=True,
            cmap="cubehelix", linecolor='k', annot=True)

<h3>3 Utilisation des techniques indiquees</h3>

In [None]:
# Split the correlations with the target by sign (the target's own entry is not removed).
coor_pos = corr_with_target.loc[corr_with_target.gt(0)]
coor_neg = corr_with_target.loc[corr_with_target.lt(0)]

In [None]:
# Columns positively correlated with the target.
coor_pos

In [None]:
# Columns negatively correlated with the target.
coor_neg

<h3> 4-la date d'ouverture affecte la prediction finale</h3>

In [None]:
# on va créer une nouvel attribut appelé "Age" qui signifie depuis combien de temps le restaurant est-il ouvert.
from datetime import date, datetime

def calculate_age(born, today=None):
    """Return the number of full years elapsed since an opening date.

    Parameters
    ----------
    born : str
        Opening date formatted as "%m/%d/%Y" (e.g. "07/17/1999").
    today : datetime.date, optional
        Reference date the age is measured against. Defaults to the current date,
        which is the historical behaviour; pass a fixed date to get results that do
        not change from one run to the next.

    Returns
    -------
    int
        Whole years between `born` and `today`; one year is subtracted when the
        anniversary has not yet been reached in the reference year.

    Raises
    ------
    ValueError
        If `born` does not match the "%m/%d/%Y" format (raised by strptime).
    """
    born = datetime.strptime(born, "%m/%d/%Y").date()
    if today is None:
        today = date.today()
    # The tuple comparison is True (== 1) while the anniversary is still ahead.
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

# Derive the 'Age' column (whole years since opening) for both frames.
df = df.assign(Age=df['Open Date'].map(calculate_age))
df_test['Age'] = df_test['Open Date'].map(calculate_age)

# Remove the 'Id' column from the training frame only; df_test keeps its 'Id' column.
df = df.drop(columns='Id')

df.head()

<h3>5- Quel type de restaurant a tendance à générer le plus de revenus ?</h3>

In [None]:
# One bar chart per categorical feature showing the revenue summed over each category.
fig, ax = plt.subplots(3, 1, figsize=(40, 30))
for variable, subplot in zip(categorical_features, ax.flatten()):
    # Total revenue per category, as a two-column frame [variable, 'total_revenue'].
    df_2 = df.groupby(variable)['revenue'].sum().reset_index(name='total_revenue')
    sns.barplot(x=variable, y='total_revenue', data=df_2, ax=subplot)
    subplot.set_xlabel(variable, fontsize=20)
    subplot.set_ylabel('Total Revenue', fontsize=20)
    # Enlarge the tick labels and tilt the x ones.
    plt.setp(subplot.get_xticklabels(), rotation=45, size=20)
    plt.setp(subplot.get_yticklabels(), size=20)
fig.tight_layout()

<p>Le type FC a tendance à générer plus de revenus.</p>

<h1>1.3 Apprentissage du modèle et réglage des hyper-paramètres</h1>

<h3>1-la régression logistique</h3>

In [None]:
#on va passer vers des valeurs discrètes qui représentent Open Date City, City Group, Type
from sklearn import preprocessing
# Replace each text column by integer codes. The same encoder object is re-fitted on
# every column, so after the loop it only holds the mapping of the last one ('Type').
le = preprocessing.LabelEncoder()
for column in ['Open Date', 'City', 'City Group', 'Type']:
    df[column] = le.fit_transform(df[column])

In [None]:
# Hold-out split of the training data.
# The target column must be removed from the feature matrix: splitting `df` as-is
# leaves 'revenue' among the predictors, so every model sees the answer (target leakage).
features = df.drop(columns=['revenue'])
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.33, random_state=324)
# List collecting the RMSE of each model, in the order the models are evaluated.
rmsee = []

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit the model on the training split and score it on the hold-out split with RMSE.
# NOTE(review): LogisticRegression is a classifier, so each distinct revenue value in
# y_train is treated as a separate class - confirm whether a regression estimator
# (e.g. LinearRegression) was intended before relying on this RMSE.
regressor = LogisticRegression()
regressor.fit(X_train, y_train)
y_prediction = regressor.predict(X_test)
RMSE_lr = sqrt(mean_squared_error(y_true = y_test, y_pred = y_prediction))
print(RMSE_lr)
# Record the score for the comparison table.
rmsee.append(RMSE_lr)

<h3>2-voting </h3>

In [None]:
from sklearn.ensemble import VotingRegressor

<h3>3- Random Forest</h3>

In [None]:
# Random forest baseline scored with RMSE on the hold-out split.
# random_state is fixed: the forest is built from random bootstraps and feature
# subsets, so an unseeded model reports a different RMSE on every run.
randomforest = RandomForestRegressor(random_state=42)
randomforest.fit(X_train, y_train)
y_prede = randomforest.predict(X_test)
RMSE_RF = sqrt(mean_squared_error(y_true=y_test, y_pred=y_prede))
print(RMSE_RF)
# Record the score for the comparison table.
rmsee.append(RMSE_RF)


<h3> 4-AdaBoost</h3>

In [None]:
from sklearn.ensemble import AdaBoostRegressor
# AdaBoost over depth-30 regression trees, scored with RMSE on the hold-out split.
base_tree = DecisionTreeRegressor(max_depth=30)
ada_reg = AdaBoostRegressor(base_tree, learning_rate=0.5, random_state=42)
ada_reg.fit(X_train, y_train)
y_pred = ada_reg.predict(X_test)
RMSE_AD = sqrt(mean_squared_error(y_test, y_pred))
# Record the score for the comparison table, then show it.
rmsee.append(RMSE_AD)
print(RMSE_AD)

<h3>5-XGBoost</h3>

In [None]:
import xgboost

# XGBoost regressor with default hyper-parameters, scored with RMSE on the hold-out split.
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_test)
RMSE_XG = sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
# Record the score for the comparison table, then show it.
rmsee.append(RMSE_XG)
print(RMSE_XG)

In [None]:
# Comparison table of the hold-out RMSE per algorithm.
# Each label is tied to its own score variable instead of relying on the append order
# of `rmsee`: re-running a model cell appends a duplicate to that list, and
# from_dict(orient='index') would then pad silently and shift scores against labels.
rmse_by_model = {
    'Logistic regression': RMSE_lr,
    'random forest': RMSE_RF,
    'AdaBoost': RMSE_AD,
    'XGBoost': RMSE_XG,
}
table1 = {'RMSE': list(rmse_by_model.values()), 'Algorithmes': list(rmse_by_model.keys())}
# Building the frame column-wise keeps RMSE numeric (no object-dtype transpose).
df1 = pd.DataFrame(table1)
df1

In [None]:
# Collects [name, fitted estimator] pairs produced by the hyper-parameter searches.
best_estimators=[]

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
## Hyper-parameter grid explored for AdaBoost
params = {
    "n_estimators": [10, 30, 50, 100],
    "learning_rate": [.01, 0.1, 0.5, 0.9, 0.95, 1],
    "random_state": [42],
}

## AdaBoost regressor, tuned by 5-fold cross-validated grid search scored with R^2
AdaBoostR = AdaBoostRegressor()
AdaBoostR_grid = GridSearchCV(
    estimator=AdaBoostR,
    param_grid=params,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
AdaBoostR_grid.fit(X_train, y_train)

## Report the best configuration and its cross-validated score
print("Best parameters:  {}:".format(AdaBoostR_grid.best_params_))
print("Best score: {}".format(AdaBoostR_grid.best_score_))
## Keep the refitted best estimator for later use
best_estimators.append(["AdaBoostR", AdaBoostR_grid.best_estimator_])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## pipeline
# Wrap every tuned estimator in a pipeline that standardises the features first.
# Each entry is a ["Scaled_<name>", pipeline] pair.
pipelines = [
    ["Scaled_" + name, Pipeline([("Scaler", StandardScaler()), (name, model)])]
    for name, model in best_estimators
]

In [None]:
# Split the engineered frame back into its train and test parts: the first
# len(df) rows come from the training set, the rest from the test set.
numTrain = len(df)

train, test = df3.iloc[:numTrain], df3.iloc[numTrain:]


In [None]:
# Final model: standardise the features, then a voting ensemble that currently holds a
# single member, the tuned AdaBoost regressor.
voting = VotingRegressor([("AdaBoostR", AdaBoostR_grid.best_estimator_)])
best_model = Pipeline([("Scaler", StandardScaler()), ("Votings", voting)])
## Fit on the whole engineered training set
best_model = best_model.fit(train, y)

In [None]:
# Predict revenue for every row of the engineered test frame.
d=best_model.predict(test)

In [None]:
# Build the submission file: one prediction per test restaurant.
# Take the identifier from the 'Id' column rather than from the positional index, so
# the file stays correct even if the test frame is ever filtered or re-indexed.
my_submission = pd.DataFrame({'Id': df_test['Id'].to_numpy(), 'Prediction': d})
my_submission.to_csv('submission.csv', index=False)