<a href="https://colab.research.google.com/github/sanyamja1n/Mobile-Price-Range-Prediction/blob/main/Classification_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import Libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Mount Google Drive at /content/drive (Colab-only step) so the dataset file can be read
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the mobile price dataset from the mounted Drive folder
df = pd.read_csv("/content/drive/MyDrive/Sample Data/data_mobile_price_range.csv")

# Show every column when a wide frame is displayed.
# Use the public `pd.set_option` rather than reaching it through the
# undocumented `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [None]:
# Dataset first look: preview the first rows
df.head()

In [None]:
# Dataset dimensions as a (rows, columns) tuple
df.shape

In [None]:
# Dataset info: column dtypes and non-null counts
df.info()

In [None]:
# Number of rows that duplicate an earlier row
df.duplicated().sum()

In [None]:
# Number of missing (NaN/null) values in each column
df.isna().sum()

In [None]:
# Column names of the dataset
df.columns

In [None]:
# Number of distinct values in each column
df.nunique()

In [None]:
# Summary statistics for each numeric column
df.describe()

The summary above shows that some rows have `px_height` or `sc_w` equal to 0. A pixel height or screen width of zero is not physically possible, so we will remove these invalid rows.



In [None]:
# Drop physically impossible records: a screen width or a pixel height of 0.
# `.copy()` makes the result an independent frame, so adding columns to `df`
# later does not trigger pandas' SettingWithCopyWarning.
valid_rows = (df['sc_w'] != 0) & (df['px_height'] != 0)
df = df[valid_rows].copy()
df.shape

In [None]:
# Bar chart of RAM against price range
sns.barplot(data=df, x='price_range', y='ram')
plt.title('Price range v/s Ram')

In [None]:
# Primary camera megapixels (pc) against price range
sns.pointplot(data=df, x="price_range", y="pc")

In [None]:

# Count phones per (price range, core count) and draw one bar per core count
cores_by_price = df.groupby(["price_range"])['n_cores'].value_counts().unstack()
cores_by_price.plot(kind="bar", figsize=(20, 8))
plt.title('Price range v/s no. of cores')
plt.show()

In [None]:
# Battery power against price range
sns.pointplot(data=df, x="price_range", y="battery_power")

In [None]:
# Internal memory against price range
sns.pointplot(data=df, x="price_range", y="int_memory")

In [None]:
# Talk time against price range
sns.pointplot(data=df, x="price_range", y="talk_time")

In [None]:
# Mobile weight against price range
sns.pointplot(data=df, x="price_range", y="mobile_wt")

In [None]:
# Pixel resolution against price range: one figure per dimension
sns.pointplot(data=df, x="price_range", y="px_width")
plt.show()
sns.pointplot(data=df, x="price_range", y="px_height")

In [None]:
# Pie chart of the value shares for each discrete feature and for the target
pie_columns = ["blue", 'dual_sim', 'four_g', 'n_cores', 'three_g', 'touch_screen', 'wifi', 'price_range']
for col in pie_columns:
  value_shares = df[col].value_counts()
  value_shares.plot(kind="pie", autopct='%1.1f%%')
  plt.show()

In [None]:
# Annotated heatmap of the pairwise column correlations
plt.figure(figsize=(20, 15))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')

In [None]:
# Outlier check: draw one boxplot per column
for col in df.columns:
  sns.boxplot(data=df, x=col)
  plt.show()

In [None]:
# Keep an independent copy of the frame before later cells add and drop columns on `df`
dfn = df.copy()

In [None]:
# Feature engineering: combine the paired screen / pixel dimensions into single features.
# Screen diagonal from height and width (Pythagoras), rounded to 2 decimals
screen_diagonal = np.sqrt(df['sc_h'] ** 2 + df['sc_w'] ** 2)
df['Screen Size'] = screen_diagonal.round(2)

# Total pixel count: height times width
df['Pixels'] = df['px_height'] * df['px_width']

In [None]:
# Drop the raw dimensions that 'Screen Size' and 'Pixels' now summarise.
# `columns=` already names the axis, so the redundant `axis=1` is removed, and the
# frame is reassigned instead of mutated in place.
df = df.drop(columns=["sc_h", 'sc_w', 'px_height', 'px_width'])

In [None]:
# Separate the target column from the feature matrix
y = df['price_range']
X = df.drop(columns='price_range')

In [None]:
# Hold out 20% of the rows for testing.
# `stratify=y` keeps the price-range class proportions the same in both splits, so the
# per-class metrics are computed on a representative test set; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Min-max scale the features. The scaler is fitted on the training split only and the
# same fitted transformation is then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def my_confusion_matrix(y_test, y_pred, plt_title):
    """Print a classification report and plot the confusion matrix as a heatmap.

    Args:
        y_test: True labels.
        y_pred: Predicted labels to compare against ``y_test``.
        plt_title: Title shown above the heatmap.

    Returns:
        The matrix returned by ``confusion_matrix(y_test, y_pred)``.
    """
    matrix = confusion_matrix(y_test, y_pred)
    print(classification_report(y_test, y_pred))

    # Annotate each cell with its count; 'g' formatting avoids scientific notation.
    axes = sns.heatmap(matrix, annot=True, fmt='g', cbar=False, cmap='BuPu')
    axes.set_xlabel('Predicted Values')
    axes.set_ylabel('Actual Values')
    axes.set_title(plt_title)
    plt.show()
    return matrix

In [None]:
# Random forest with default hyper-parameters. The fixed seed makes the fitted trees,
# and therefore the reported scores and feature importances, reproducible across runs.
rf = RandomForestClassifier(random_state=42)

rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print('Random Forest Classifier Test Accuracy Score: ', accuracy_score(y_test, y_pred_rf))
cm_rfc = my_confusion_matrix(y_test, y_pred_rf, 'Random Forest Confusion Matrix')

In [None]:
# Rank the features by the random forest's importance scores.
# The names are taken from X, the frame the train/test split was made from, so they
# stay aligned with rf.feature_importances_ even if `df` is modified afterwards.
feature = X.columns
importances = rf.feature_importances_

importance_dict = {'Feature': list(feature),
                   'Feature Importance': importances}
importance_df = pd.DataFrame(importance_dict)
importance_df['Feature Importance'] = importance_df['Feature Importance'].round(2)

# Raise the display limits so the whole table is printed without truncation
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
print(importance_df.sort_values(by=['Feature Importance'], ascending=False))

In [None]:
# Gaussian Naive Bayes with default settings
nb = GaussianNB()

nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

print('Gaussian Naive Bayes Classifier Accuracy Score: ', accuracy_score(y_test, y_pred_nb))
# Store the matrix under its own name; the copy-pasted `cm_rfc` overwrote the
# random forest's confusion matrix.
cm_nb = my_confusion_matrix(y_test, y_pred_nb, 'Gaussian Naive Bayes Confusion Matrix')

In [None]:
# k-nearest neighbours with default settings
knn = KNeighborsClassifier()

knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

print('KNN Classifier Accuracy Score: ', accuracy_score(y_test, y_pred_knn))
# Store the matrix under its own name; the copy-pasted `cm_rfc` overwrote the
# random forest's confusion matrix.
cm_knn = my_confusion_matrix(y_test, y_pred_knn, 'KNN Confusion Matrix')

In [None]:
# Support vector classifier with default settings
svmc = svm.SVC()

svmc.fit(X_train, y_train)
y_pred_svm = svmc.predict(X_test)

print('SVM Classifier Accuracy Score: ', accuracy_score(y_test, y_pred_svm))
# Store the matrix under its own name; the copy-pasted `cm_rfc` overwrote the
# random forest's confusion matrix.
cm_svm = my_confusion_matrix(y_test, y_pred_svm, 'SVM Confusion Matrix')

In [None]:
# Logistic regression; max_iter is raised to 10000 to give the solver room to converge.
# LogisticRegression is imported in the notebook's top import cell.
LR = LogisticRegression(fit_intercept=True, max_iter=10000)

LR.fit(X_train, y_train)
y_pred_LR = LR.predict(X_test)

print('Logistic Regression Accuracy Score: ', accuracy_score(y_test, y_pred_LR))
# Store the matrix under its own name; the copy-pasted `cm_rfc` overwrote the
# random forest's confusion matrix.
cm_lr = my_confusion_matrix(y_test, y_pred_LR, 'LR Confusion Matrix')