In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the dataset

In [None]:
# Read the Mall Customers CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/mall-customers/Mall_Customers.csv",
)
data

# Saving a copy of the dataset

In [None]:
# Keep an independent (deep) snapshot of the frame before it is modified.
data_copy = data.copy(deep=True)

# EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Density histogram with a KDE overlay for customer age.
# `sns.distplot` is deprecated in seaborn; `histplot(kde=True, stat='density')`
# is the documented replacement for the same histogram-plus-KDE view.
sns.histplot(data=data, x='Age', kde=True, stat='density');

In [None]:
# Density histogram with a KDE overlay for the spending score.
# `sns.distplot` is deprecated in seaborn; `histplot(kde=True, stat='density')`
# is the documented replacement for the same histogram-plus-KDE view.
sns.histplot(data=data, x='Spending Score (1-100)', kde=True, stat='density');

In [None]:
# Mean / min / max spending score per gender ('Genre' column).
# `agg` with a list of functions already yields a DataFrame.
temp = (
    data
    .groupby('Genre')['Spending Score (1-100)']
    .agg(['mean', 'min', 'max'])
)
temp

As we can see, female customers tend to spend more than male customers.

In [None]:
# Mean / min / max / median spending score for each distinct age.
# `agg` with a list of functions already yields a DataFrame.
temp = (
    data
    .groupby('Age')['Spending Score (1-100)']
    .agg(['mean', 'min', 'max', 'median'])
)
temp

People between the ages of 20 and 40 (estimated from the data) spend more money.

# Data Processing

**null values?**

In [None]:
# Count missing values per column.
data.isna().sum()

Dropping the ID column

In [None]:
# Drop the identifier column. Rebinding (instead of `inplace=True`) together with
# `errors='ignore'` keeps this cell safe to re-run: a second execution no longer
# raises KeyError because 'CustomerID' is already gone.
data = data.drop(columns='CustomerID', errors='ignore')

**creating new features**

1. **Feature binning**

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
# Three independent 5-bin discretizers with ordinal (integer) bin codes.
age_binner, income_binner, score_binner = (
    KBinsDiscretizer(n_bins=5, encode='ordinal') for _ in range(3)
)

In [None]:
# Add an integer bin-code column for each of the three numeric columns.
# Each binner is fit on a single-column (n, 1) array and its float output is cast to int64.
data['Age_Group'] = (
    age_binner.fit_transform(data[['Age']].to_numpy()).astype(np.int64)
)
data['Income_binned'] = (
    income_binner.fit_transform(data[['Annual Income (k$)']].to_numpy()).astype(np.int64)
)
# NOTE(review): this column is computed from 'Spending Score (1-100)' itself, so using it
# as a model input when predicting that score leaks the target — confirm this is intended.
data['Spending_Score_Binned'] = (
    score_binner.fit_transform(data[['Spending Score (1-100)']].to_numpy()).astype(np.int64)
)

**encoding categorical data to numeric data**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the categorical 'Genre' labels with the integer codes learned by the encoder.
le = LabelEncoder()
data['Genre'] = le.fit(data['Genre']).transform(data['Genre'])

**scaling the data**

In [None]:
# Target is the spending score; the features are every other column of `data`.
# NOTE(review): only the target column is removed here, so any column derived from
# the spending score that is present in `data` remains a feature — check for leakage.
y = data['Spending Score (1-100)']
X = data.drop(columns='Spending Score (1-100)')

In [None]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler

In [None]:
robust, minmax = RobustScaler(), MinMaxScaler()

# Scale every feature column on its own: robust scaling first, then min-max.
# The same two scaler objects are re-fit for each column, so after the loop they
# hold the parameters of the last column only.
# NOTE(review): the scalers are fit on all rows of X, i.e. before any train/test
# split that happens later — confirm that is acceptable.
for feature in X.columns:
    X[feature] = minmax.fit_transform(
        robust.fit_transform(X[[feature]].to_numpy())
    )

In [None]:
# Scale the target with the same robust -> min-max chain used for the features.
# The raw values are read from `data` rather than from `y`, so re-running this cell
# always re-fits the scalers on the original scores (the previous version rebound `y`
# from a Series to an ndarray and then failed on `.values` when executed twice).
# After this cell `robust` and `minmax` hold the target's scaling parameters.
target_raw = data['Spending Score (1-100)'].to_numpy(dtype=float).reshape(-1, 1)
# Flatten to 1-D: estimators expect y of shape (n,), and an (n, 1) column vector
# makes scikit-learn emit a DataConversionWarning on fit.
y = minmax.fit_transform(robust.fit_transform(target_raw)).ravel()

# Splitting data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set; fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=56,
    shuffle=True,
)

# Splitting training data

In [None]:
# Carve a validation set (20% of the training rows) out of the training data.
x_train_, x_val, y_train_, y_val = train_test_split(
    x_train, y_train,
    test_size=0.2,
    random_state=56,
    shuffle=True,
)

# Model Selection

In [None]:
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.linear_model import SGDRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
def model_selection(x_train_, x_val, y_train_, y_val, model):
    """Fit a default-configured regressor and report train/validation metrics.

    Parameters
    ----------
    x_train_, y_train_ : array-like
        Features and targets used to fit the estimator.
    x_val, y_val : array-like
        Held-out features and targets used for evaluation.
    model : callable
        Estimator class (or zero-argument factory) whose instances provide
        ``fit``, ``predict`` and ``score``.

    Returns
    -------
    dict
        ``acc`` (``r2_score`` on the validation set), ``error`` (validation RMSE),
        ``train_score`` and ``val_score`` (``estimator.score`` values),
        ``is_overfitting`` (train score strictly greater than validation score)
        and ``overfitting_by`` (difference of the two scores, in percentage points).
        The same figures are also printed.
    """
    # Flatten the targets so (n, 1) column vectors are accepted without
    # scikit-learn's DataConversionWarning.
    y_fit = np.ravel(y_train_)
    y_eval = np.ravel(y_val)

    # Keep the class/factory and the fitted instance under separate names.
    estimator = model()
    estimator.fit(x_train_, y_fit)

    pred = estimator.predict(x_val)

    acc = r2_score(y_eval, pred)
    error = np.sqrt(mean_squared_error(y_eval, pred))
    train_score = estimator.score(x_train_, y_fit)
    val_score = estimator.score(x_val, y_eval)
    is_overfitting = bool(train_score > val_score)
    overfitting_by = train_score*100 - val_score*100

    print('Acc :', acc*100)
    print('\n')
    print('Error:', error)
    print('\n')
    print('Train Score:', train_score*100)
    print('\n')
    print('Val Score:', val_score*100)
    print('\n')
    print('Is overfitting:', is_overfitting)
    print('\n')
    print('Overfitting by:', overfitting_by)

    # Return the metrics so callers that assign the result get something usable
    # instead of None.
    return {
        'acc': acc,
        'error': error,
        'train_score': train_score,
        'val_score': val_score,
        'is_overfitting': is_overfitting,
        'overfitting_by': overfitting_by,
    }

In [None]:
# Evaluate ExtraTreesRegressor (default settings) on the validation split.
extratrees = model_selection(
    x_train_, x_val, y_train_, y_val,
    model=ExtraTreesRegressor,
)
extratrees

In [None]:
# Evaluate GradientBoostingRegressor (default settings) on the validation split.
gradient = model_selection(
    x_train_, x_val, y_train_, y_val,
    model=GradientBoostingRegressor,
)
gradient

In [None]:
# Evaluate RandomForestRegressor (default settings) on the validation split.
forest = model_selection(
    x_train_, x_val, y_train_, y_val,
    model=RandomForestRegressor,
)
forest

In [None]:
# Evaluate AdaBoostRegressor (default settings) on the validation split.
ada = model_selection(
    x_train_, x_val, y_train_, y_val,
    model=AdaBoostRegressor,
)
ada

In [None]:
# Evaluate XGBRegressor (default settings) on the validation split.
xgb = model_selection(
    x_train_, x_val, y_train_, y_val,
    model=XGBRegressor,
)
xgb

In [None]:
# Evaluate DecisionTreeRegressor (default settings) on the validation split.
tree = model_selection(
    x_train_, x_val, y_train_, y_val,
    model=DecisionTreeRegressor,
)
tree

In [None]:
# Evaluate the single-tree ExtraTreeRegressor (default settings) on the validation split.
extratree = model_selection(
    x_train_, x_val, y_train_, y_val,
    model=ExtraTreeRegressor,
)
extratree

In [None]:
# Evaluate CatBoostRegressor (default settings) on the validation split.
catboost = model_selection(
    x_train_, x_val, y_train_, y_val,
    model=CatBoostRegressor,
)
catboost

In [None]:
# Evaluate SGDRegressor (default settings) on the validation split.
sgd = model_selection(
    x_train_, x_val, y_train_, y_val,
    model=SGDRegressor,
)
sgd

In [None]:
# Evaluate KNeighborsRegressor (default settings) on the validation split.
neighbour = model_selection(
    x_train_, x_val, y_train_, y_val,
    model=KNeighborsRegressor,
)
neighbour

In [None]:
# Evaluate SVR (default settings) on the validation split.
svr = model_selection(
    x_train_, x_val, y_train_, y_val,
    model=SVR,
)
svr

I will choose RandomForestRegressor

# Model Building and Training

In [None]:
# Fit the chosen model on the full training split (train + validation rows).
# A fixed `random_state` (same seed as the data splits) makes the fitted forest,
# and therefore its predictions, identical across runs.
model = RandomForestRegressor(random_state=56)
# `np.ravel` hands the estimator a 1-D target; an (n, 1) column vector would
# trigger scikit-learn's DataConversionWarning.
model.fit(x_train, np.ravel(y_train))

# Predictions

In [None]:
# Predict the (scaled) target for the held-out test rows.
pred = model.predict(X=x_test)
pred

# Metric check

In [None]:
# R^2 on the test set, shown as a percentage.
accuracy = r2_score(y_true=y_test, y_pred=pred)
100 * accuracy

In [None]:
# Root mean squared error on the test set (in scaled-target units).
error = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=pred)
)
error

In [None]:
from sklearn.metrics import mean_squared_log_error

In [None]:
# Root mean squared logarithmic error on the test set.
error_log = np.sqrt(
    mean_squared_log_error(y_true=y_test, y_pred=pred)
)
error_log

In [None]:
# Gap between the train and test scores, in percentage points.
overfitting_rate = (
    model.score(X=x_train, y=y_train) * 100
    - model.score(X=x_test, y=y_test) * 100
)
overfitting_rate

# Predicting with random rows

In [None]:
# Show the scaled feature row at position 98 (kept as a one-row DataFrame).
X.iloc[[98]]

In [None]:
# Predict for the row at position 98 and print it next to the scaled target value.
pred_ = model.predict(X.iloc[[98]])
print(
    'Original values:', y[98],
    'Predicted value:', pred_,
)

In [None]:
# Show the scaled feature row at position 199 (kept as a one-row DataFrame).
X.iloc[[199]]

In [None]:
# Predict for the row at position 199 and print it next to the scaled target value.
pred_ = model.predict(X.iloc[[199]])
print(
    'Original values:', y[199],
    'Predicted value:', pred_,
)

random value

In [None]:
# Predict for one hand-written sample. The values follow the column order of X.
# Wrapping them in a DataFrame with X's column names matches how the model was
# fit (on a DataFrame) and avoids scikit-learn's "X does not have valid feature
# names" warning that a bare nested list triggers.
# NOTE(review): the features in X were min-max scaled, so the last value (4.0)
# lies outside the fitted range — confirm it is intended.
model.predict(
    pd.DataFrame([[0.0, 0.3269, 0.03, 0.5, 0.0, 4.0]], columns=X.columns)
)

In [None]:
# Map a scaled prediction back to the original units by undoing the two scaling
# steps in reverse order (min-max first, then robust).
# NOTE(review): 0.97122449 is hard-coded — presumably a model output copied from an
# earlier run; it also relies on `minmax`/`robust` having last been fit on the target.
pred = minmax.inverse_transform(np.array([[0.97122449]]))
pred_ = robust.inverse_transform(pred)
pred_