# Prepare Workspace

#### Reference: "Approaching (almost) Any Machine Learning Problem", by Abhishek Thakur

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# for statistics
import statistics as st 
import scipy.stats as stats
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score




import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        file_path = os.path.join(folder, name)
        print(file_path)



In [None]:
# Load the training data from the Kaggle input directory into a DataFrame.
train_csv_path = '/kaggle/input/mobile-price-classification/train.csv'
train = pd.read_csv(train_csv_path)

# Have a look at the data

In [None]:
# Dimensions of the loaded data as (number of rows, number of columns)
train.shape

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
train.info()

In [None]:
# Descriptive statistics for every column, transposed so that each feature is a row
train.describe(include='all').transpose()

# Handling Missing Values

In [None]:
# Percentage of missing values per column (numeric and categorical features alike)
n_rows = len(train)
null_counts = train.isnull().sum()
missing = null_counts / n_rows * 100
missing

In [None]:
# Separate the target variable from the features, working on a copy so that
# `train` itself is left untouched
X_full = train.copy()
y = X_full.pop('price_range')  # removes the column from X_full and returns it

# Target Variable Analysis

In [None]:
# Summarize the class distribution: absolute counts next to relative frequencies
count = pd.crosstab(index=y, columns="count")
frequency = pd.crosstab(index=y, columns="frequency")
percentage = frequency / frequency.sum()  # share of each class in the total
pd.concat([count, percentage], axis=1)

In [None]:
# Plot the number of observations in each class of the target variable.
# `y` is passed directly as the x vector; no `data` frame is supplied because
# the target column has already been removed from X_full.
ax = sns.countplot(x=y)
# Set the title in a separate statement so that `ax` keeps referring to the
# Axes object rather than to the Text object returned by `set_title`.
ax.set_title("Target Variable Distribution")
plt.show()

# Zero/Close Zero Variance Predictors

In [None]:
# Collect the features whose population variance (ddof=0) is zero or below 0.05
variance_threshold = 0.05
to_drop = [
    name for name in X_full.columns
    if X_full[name].var(ddof=0) < variance_threshold
]
to_drop

# Correlated Predictors

In [None]:
# Pairwise Pearson correlation between all feature columns (input for the heatmap)
corr_matrix = X_full.corr(method='pearson')

In [None]:
# Draw an annotated heatmap of the feature correlation matrix on a 15x15 figure
sns.set(rc={'figure.figsize': (15, 15)})
heatmap_options = dict(square=True, annot=True, fmt='.2f')
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Heatmap on data set', size=15)
plt.xticks(fontsize="13")
plt.yticks(fontsize="13")
plt.show()

In [None]:
# Identify highly correlated features that are candidates for removal.
# Keep only the strict upper triangle of the correlation matrix (k=1 excludes
# the diagonal) so that each pair of features is considered exactly once.
# The builtin `bool` is used as dtype because the `np.bool` alias was removed
# from NumPy (deprecated in 1.20, removed in 1.24).
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
# Flag every column whose absolute correlation with a preceding column exceeds 0.75
to_drop = [column for column in upper.columns if (upper[column].abs() > 0.75).any()]
to_drop

# Pre-processing

In [None]:
# Min-max normalization: learn each feature's range from X_full, then rescale it
scaling = MinMaxScaler().fit(X_full)
X_full_sc = scaling.transform(X_full)

In [None]:
# Split the data set into train (80%) and test (20%) parts.
# The raw features are split first and the scaler is then fitted on the
# training part only, so that the minima/maxima of the test rows do not leak
# into the preprocessing. The row assignment is determined by random_state and
# the number of samples, not by the feature values.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_full, y, train_size=0.8, random_state=0
)
scaling = MinMaxScaler()
X_train = scaling.fit_transform(X_train_raw)  # learn the ranges from train only
X_test = scaling.transform(X_test_raw)        # reuse the train ranges on test

# Modeling

In [None]:
# Tune a random forest with an exhaustive grid search, scored by stratified
# 5-fold cross-validation on the training split
param_grid = {
    'n_estimators': range(100, 501, 100),  # 100, 200, ..., 500 trees
    'max_depth': range(1, 5),              # depths 1 through 4
}
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
rf_model = RandomForestClassifier(random_state=0)
my_model = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=kf,
    verbose=10,
    n_jobs=-1,
)
my_model.fit(X_train, y_train)
print(f"Best score: {my_model.best_score_}")

In [None]:
# Report the selected value of every hyper-parameter that was part of the search
print("Best parameters:")
best_parameters = my_model.best_estimator_.get_params()
tuned_names = sorted(param_grid)  # iterating a dict yields its keys
for param_name in tuned_names:
    print(f"\t {param_name}: {best_parameters[param_name]}")

# Prediction

In [None]:
# Predict on both splits with the best estimator found by the grid search
best_rf = my_model.best_estimator_
predictions_tr = best_rf.predict(X_train)
predictions_te = best_rf.predict(X_test)

In [None]:
# Accuracy of the predictions on the training split and on the test split
accuracy_train = accuracy_score(y_true=y_train, y_pred=predictions_tr)
accuracy_test = accuracy_score(y_true=y_test, y_pred=predictions_te)

In [None]:
# Display the accuracy obtained on the training split
accuracy_train

In [None]:
# Display the accuracy obtained on the test split
accuracy_test

# Feature Importance

In [None]:
# Horizontal bar chart of the best estimator's feature importances,
# ordered from least to most important
features = X_full.columns
importances = my_model.best_estimator_.feature_importances_
indices = np.argsort(importances)  # positions sorted by ascending importance
bar_positions = range(len(indices))
bar_labels = [features[i] for i in indices]

plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, bar_labels)
plt.xlabel('Relative Importance')
plt.title('Feature Importances')
plt.show()