In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os,gc
import warnings 
warnings.filterwarnings("ignore")

from math import sqrt

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import eli5
from eli5.sklearn import PermutationImportance

In [None]:
# Read the train split, the test split and the sample-submission template
# from the Kaggle input directory in one pass.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/restaurant-revenue-prediction/train.csv',
        '../input/restaurant-revenue-prediction/test.csv',
        '../input/restaurant-revenue-prediction/sampleSubmission.csv',
    )
)

In [None]:
# Print the (rows, columns) shape of the train and test frames side by side.
print(train.shape, test.shape)

1. We have just 137 rows to train the model.
2. The test data is pretty huge compared to the train data.

## Preprocessing and EDA

In [None]:
# Preview the first ten training rows.
train.head(n=10)

1. The target column is 'revenue'.
2. Dataset is anonymised.
3. This is a regression problem.

In [None]:
# Column dtypes and non-null counts of the raw training frame.
train.info()

Here,
1. There are no missing values.
2. We have 4 categorical columns.

In [None]:
# Summary statistics of the numeric training columns.
train.describe()

In [None]:
# Histogram of the target with a KDE overlay.
# `histplot` keeps the y-axis in raw counts, which is what the
# 'Number of Restaurants' label states; the deprecated `distplot` drew a
# density-normalised histogram under that same label.
fig, ax = plt.subplots(figsize=(6, 6))
sns.histplot(train['revenue'], kde=True, bins=20, ax=ax)
ax.set_title('Number of Restaurants vs Revenue')
ax.set_xlabel('Revenue')
ax.set_ylabel('Number of Restaurants');

Most restaurants generate revenue between 0.25e7 and 0.5e7. Now, let's see how the city affects a restaurant's revenue.

In [None]:
# Number of distinct cities in the training data.
train['City'].nunique()

In [None]:
# Bar chart: how many training restaurants each city has.
fig, ax = plt.subplots(figsize=(8, 4))
restaurants_per_city = train['City'].value_counts()
restaurants_per_city.plot.bar(ax=ax)
ax.set_title('No of restaurants vs City')
ax.set_xlabel('City')
ax.set_ylabel('No of restaurants')

1. Istanbul has the maximum number of restaurants.
2. Second is Ankara and then Izmir.
3. The rest of the cities have fewer than 10 restaurants each.

In [None]:
# Bar chart: average revenue of the training restaurants in each city.
city_mean_revenue = train.groupby('City')[['revenue']].mean()
ax = city_mean_revenue.plot.bar()
ax.set_title('Mean Revenue Generated vs City')
ax.set_xlabel('City')
ax.set_ylabel('Mean Revenue Generated')

Here, 
1. Mean Revenue Generated is over 5M for a few cities.
2. Mean Revenue Generated (MRG) is between 2M and 4M for most cities.
3. It is less than 2M for just 2 cities.

We can't use label encoding on this column, as it would mislead the model. Instead, we can bin the cities based on Mean Revenue Generated.

In [None]:
# Mean revenue per city, truncated to whole millions; this integer serves as
# the city's bin.
city_revenue = train.groupby('City', as_index=False)['revenue'].mean()
mean_revenue_per_city = city_revenue.assign(
    revenue=city_revenue['revenue'].map(lambda mean_rev: int(mean_rev / 1e6))
)

# Lookup table: city name -> revenue bin (in millions).
mean_dict = mean_revenue_per_city.set_index('City')['revenue'].to_dict()
mean_dict

In [None]:
# Swap each known city name for its revenue bin; names missing from
# `mean_dict` are left untouched.
train['City'] = train['City'].replace(mean_dict)
test['City'] = test['City'].replace(mean_dict)

In [None]:
# Any test city still stored as a string was absent from `mean_dict`;
# assign all of those the same fallback bin.
UNSEEN_CITY_BIN = 6
test['City'] = test['City'].map(
    lambda city: UNSEEN_CITY_BIN if isinstance(city, str) else city
)

Now, let's see the 'City Group' column.

In [None]:
# Distinct values of the 'City Group' column in train.
train['City Group'].unique()

In [None]:
# Number of training restaurants per city group.
# The column is passed as the `x` keyword: newer seaborn releases no longer
# accept the data vector as a leading positional argument.
ax = sns.countplot(x=train['City Group'])
ax.set_ylabel('No. of Restaurants')
ax.set_title('No of Restaurants vs City Group')

More restaurants are located in 'Big Cities' than in 'Other' cities.

In [None]:
# Bar chart: average revenue for each city group.
group_mean_revenue = train.groupby('City Group')[['revenue']].mean()
ax = group_mean_revenue.plot.bar()
ax.set_ylabel('Mean Revenue Generated')
ax.set_title('Mean Revenue Generated vs City Group')

Mean revenue generated by restaurants in 'Big Cities' is close to 5M whereas in 'Other' cities it is close to 4M. We can use label encoding on this column.

In [None]:
# Learn the city-group -> integer mapping on train, then apply the same
# mapping to both splits.
lr = LabelEncoder().fit(train['City Group'])
train['City Group'] = lr.transform(train['City Group'])
test['City Group'] = lr.transform(test['City Group'])

Now the 'Type' column.

In [None]:
# Distinct restaurant types present in train.
train['Type'].unique()

In [None]:
# Number of training restaurants per type.
# Passed as the `x` keyword because newer seaborn releases do not accept the
# data vector as a leading positional argument.
sns.countplot(x=train['Type'])

Here,
1. We have three types of restaurants, but in the test set another type 'MB' is present. We'll have to fit the label encoder on the test data.

In [None]:
# Bar chart: average revenue for each restaurant type.
type_mean_revenue = train.groupby('Type')[['revenue']].mean()
ax = type_mean_revenue.plot.bar()
ax.set_title('Mean Revenue per Type')

In [None]:
# Encode the restaurant type as an integer.
# The encoder is fitted on the union of train and test labels, so a category
# that exists in only one split (such as 'MB', seen only in test) can never
# raise an unseen-label error during transform.
lr = LabelEncoder().fit(pd.concat([train['Type'], test['Type']]))
train['Type'] = lr.transform(train['Type'])
test['Type'] = lr.transform(test['Type'])

Now, only 'Open Date' categorical column is left. We'll ignore it for now.

In [None]:
# Re-check dtypes after encoding 'City', 'City Group' and 'Type'.
train.info()

In [None]:
def _offdiag_correlations(frame):
    """Return the off-diagonal entries of ``frame``'s correlation matrix as a flat array.

    Only numeric columns are correlated: a string column such as 'Open Date'
    makes ``DataFrame.corr()`` raise on recent pandas versions. The diagonal
    is removed by position rather than by comparing values against 1, so a
    genuine off-diagonal correlation of exactly 1 is kept.
    """
    corr_matrix = frame.select_dtypes(include='number').corr().values
    off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)
    return corr_matrix[off_diagonal]


# Feature-to-feature correlations only, so the target is dropped from train.
train_correlations = _offdiag_correlations(train.drop(["revenue"], axis=1))
test_correlations = _offdiag_correlations(test)

# Overlay both distributions to compare how correlated the features are in
# each split.
plt.figure(figsize=(20,5))
sns.distplot(train_correlations, color="Red", label="train")
sns.distplot(test_correlations, color="Green", label="test")
plt.xlabel("Correlation values found in train (except 1)")
plt.ylabel("Density")
plt.title("Are there correlations between features?");
plt.legend();

Features in the train dataset are more highly correlated than those in the test set. Let's create a baseline and check the most important features using Permutation Importance.

In [None]:
# Annotated correlation heatmap of the training frame.
# Only numeric columns are used: 'Open Date' is still a string column and
# would make `DataFrame.corr()` raise on recent pandas versions.
plt.figure(figsize=(20,20))
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Feature matrix: everything except the target, the row id and the raw
# (still unparsed) opening date. Target: revenue.
feature_frame = train.drop(columns=['revenue', 'Id', 'Open Date'])
X, y = feature_frame, train['revenue']

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Ordinary least-squares baseline, used only to rank features by permutation
# importance.
# The `normalize` argument is no longer passed: it was removed from
# scikit-learn's LinearRegression and now raises a TypeError. For an
# unregularised fit, rescaling the inputs does not change the predictions
# (assuming X has full column rank), so the baseline is unaffected.
model = LinearRegression()
model.fit(X, y)

In [None]:
# Permutation importance of the fitted baseline, evaluated on the training
# data, rendered as an eli5 weights table.
perm = PermutationImportance(model, random_state=1)
perm.fit(X, y)
eli5.show_weights(perm, feature_names=list(X.columns))

The values towards the top are the most important features, and those towards the bottom matter least. P26, P9, P16, P36, P8, P18 and City are important features. Now, let's plot their graphs.

In [None]:
# Features picked from the top of the permutation-importance table.
important_features = ['P26', 'P9', 'P16', 'P36', 'P8', 'P18']

# 3x2 grid with a shared x-axis: one distribution plot per selected feature.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 12), sharex=True)
fig.suptitle('Distribution Plots of Important Features')

for feature, ax in zip(important_features, axes.ravel()):
    sns.distplot(X[feature], ax=ax)

In [None]:
# Pairwise scatter plots (with per-feature distributions on the diagonal)
# for the selected features.
sns.pairplot(train[important_features])

## Feature Engineering

Brute force feature engineering.

In [None]:
# Display the list of features that the engineered columns below are built from.
important_features

In [None]:
# For every important feature, add a column holding that feature's mean
# within the row's 'City' bin. The means are computed separately inside each
# split, and columns are added in the same order to train and test so the
# two feature matrices stay aligned.
for frame in (train, test):
    for feature in important_features:
        frame[feature + '_to_City_mean'] = (
            frame.groupby('City')[feature].transform('mean')
        )

In [None]:
# For every important feature, add a column holding that feature's mean
# within the row's 'City Group'. The means are computed separately inside
# each split, and columns are added in the same order to train and test so
# the two feature matrices stay aligned.
for frame in (train, test):
    for feature in important_features:
        frame[feature + '_to_City_group_mean'] = (
            frame.groupby('City Group')[feature].transform('mean')
        )

In [None]:
# Rebuild the feature matrix so it picks up the engineered columns.
X = train.drop(columns=['revenue', 'Id', 'Open Date'])
y = train.loc[:, 'revenue']

In [None]:
# Preview the feature matrix including the engineered columns.
X.head()

## Baseline Submission

Using KFold cross-validation, because the size of training data is very small.

In [None]:
# 10-fold shuffled cross-validation of a LightGBM regressor, reporting the
# RMSE of every fold and the mean over folds.
cv = KFold(n_splits=10, shuffle=True, random_state=108)
model = LGBMRegressor(n_estimators=200, learning_rate=0.01, subsample=0.7, colsample_bytree=0.8)

scores = []
for fit_rows, holdout_rows in cv.split(X):
    X_train, y_train = X.iloc[fit_rows], y.iloc[fit_rows]
    X_val, y_val = X.iloc[holdout_rows], y.iloc[holdout_rows]

    # `fit` retrains the estimator from scratch on this fold's training rows.
    preds = model.fit(X_train, y_train).predict(X_val)

    rmse = sqrt(mean_squared_error(y_val, preds))
    print(rmse)
    scores.append(rmse)

print("\nMean score %d"%np.mean(scores))

In [None]:
# Preview the test frame after encoding and feature engineering.
test.head()

In [None]:
# After the CV loop `model` only holds the fit from the last fold, i.e. it has
# seen 9/10 of an already tiny training set. Refit on every training row
# before predicting the submission.
model.fit(X, y)

# Select the test columns by name in the training order so the features line
# up with what the model was fitted on.
predictions = model.predict(test[X.columns])
sample['Prediction'] = predictions

In [None]:
# Distribution of the predicted revenues, as a sanity check.
sns.distplot(predictions, bins=20)

In [None]:
# Write the submission file without the DataFrame index column.
sample.to_csv('submission.csv', index=False)