In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipcodes as zcode
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report , confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Load the train/test CSV files from a single configurable directory.
# Edit DATA_DIR to point at your local copy of the data set.
DATA_DIR = "C:/data sets/mobile"
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
train_df.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Summary statistics, transposed so that each feature is shown as a row.
train_df.describe().T

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# (rows, columns) of the test frame.
test_df.shape

In [None]:
# Number of missing values in each training column.
train_df.isna().sum()

#### plot relation between price_range and features

In [None]:
# One box plot per feature, grouped by price_range, laid out on a 4x2 grid.
price_features = ['ram', 'battery_power', 'px_height', 'px_width',
                  'int_memory', 'wifi', 'mobile_wt', 'n_cores']
fig, ax = plt.subplots(ncols=2, nrows=4, figsize=(12,28))
# ax.flat walks the grid row by row, so features fill left-to-right, top-to-bottom.
for panel, feature in zip(ax.flat, price_features):
    sns.boxplot(x='price_range', y=feature, data=train_df, ax=panel)

#### Outlier Analysis of Non-Categorical Data:

In [None]:
# Outlier check: one horizontal box plot per non-categorical feature on a 7x2 grid.
numeric_features = ['battery_power', 'clock_speed', 'fc', 'pc', 'px_width',
                    'sc_h', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores',
                    'px_height', 'ram', 'sc_w', 'talk_time']
fig, ax = plt.subplots(ncols=2, nrows=7, figsize=(12,28))
# ax.flat walks the grid row by row, matching the order of numeric_features.
# Drawing inside a loop also keeps a stray Axes repr out of the cell output.
for panel, feature in zip(ax.flat, numeric_features):
    sns.boxplot(x=train_df[feature], ax=panel)

In [None]:
# Mean RAM per price range, drawn on an explicitly created axes.
ram_fig, ram_ax = plt.subplots(figsize=(8,3))
sns.barplot(data=train_df, x='price_range', y='ram', ax=ram_ax)
plt.show()

#### price_range vs Ram

In [None]:
# Per-phone RAM values for each price range (categorical scatter).
ram_grid = sns.catplot(data=train_df, x="price_range", y="ram")
plt.xticks(rotation=90);

#### plotting Relation between Price_range and Battery Power

In [None]:
# Mean battery power per price range, drawn on an explicitly created axes.
battery_fig, battery_ax = plt.subplots(figsize=(8,3))
sns.barplot(data=train_df, x='price_range', y='battery_power', ax=battery_ax)
plt.show()

#### Plotting the relation between price_range and pixel height/width

In [None]:
# Mean pixel height (left) and pixel width (right) per price range, side by side.
pixel_fig, (height_ax, width_ax) = plt.subplots(1, 2, figsize=(10,5))
sns.barplot(data=train_df, x='price_range', y='px_height', palette='Reds', ax=height_ax)
sns.barplot(data=train_df, x='price_range', y='px_width', palette='Blues', ax=width_ax)
plt.show()


#### plotting Relation between Price_range and 3G/4G

In [None]:
# Number of phones with/without 3G, split by price range.
plt.figure(figsize=(10,5))
# Pass the frame as `data` and name the columns by keyword: recent seaborn
# releases bind the first positional argument to `data`, not to `x`.
sns.countplot(data=train_df, x='three_g', hue='price_range', palette='pink')
plt.show()

In [None]:
# Number of phones with/without 4G, split by price range.
plt.figure(figsize=(10,5))
# Pass the frame as `data` and name the columns by keyword: recent seaborn
# releases bind the first positional argument to `data`, not to `x`.
sns.countplot(data=train_df, x='four_g', hue='price_range', palette='ocean')
plt.show()

In [None]:
# Feature matrix: every column except the target; target vector: price_range.
x = train_df.drop(columns='price_range')
y = train_df['price_range']

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Distinct target values present in price_range.
y.unique()
# There are four price ranges, so this is a multi-class classification problem.

In [None]:
# Share of each price range in the training data.
# NOTE(review): the label order assumes class ids 0..3 mean low .. very high cost — confirm.
labels = ["low cost", "medium cost", "high cost", "very high cost"]
# value_counts() orders by frequency; sort_index() re-orders the counts by class id
# so that every slice is paired with the label of its own class.
values = train_df['price_range'].value_counts().sort_index().values
colors = ['yellow','turquoise','lightblue', 'pink']
fig1, ax1 = plt.subplots()
ax1.pie(values, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=90)
plt.show()
# As the pie chart shows, the data set is balanced.

In [None]:
# Correlation of every attribute with price_range, strongest first.
corr = train_df.corr()['price_range']
# The first entry after sorting is dropped (price_range correlated with itself).
cr = corr.sort_values(ascending=False).iloc[1:]
sns.barplot(x=cr, y=cr.index, palette="bright")
plt.title("Correlation between Attributes and Price Range")

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
# plt.subplots() returns a (Figure, Axes) pair; unpack it and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(train_df.corr(), square=True, cbar=True, annot=True, cmap="GnBu",
            annot_kws={'size': 8}, ax=ax)
ax.set_title('Correlations between Attributes')
plt.show()

##### We see from the heatmap;

### Decision Tree

In [None]:
# Decision tree with the Gini impurity criterion (scikit-learn's default, spelled out here).
dt_gini = DecisionTreeClassifier(criterion="gini", random_state=101)
# The value returned by fit() is kept under a second name for display below.
dt_model = dt_gini.fit(x_train, y_train)

In [None]:
# Predicted price range for every row of the held-out split.
y_pred_dt_gini = dt_gini.predict(x_test)

In [None]:
# Display the fitted estimator and its parameters.
dt_model

In [None]:
# Raw confusion matrix of the Gini tree on the held-out split.
print(confusion_matrix(y_test, y_pred_dt_gini))

In [None]:
# Per-class precision / recall / F1 of the Gini tree on the held-out split.
print(classification_report(y_test, y_pred_dt_gini))

In [None]:
# Overall accuracy of the Gini tree; kept for the model comparison further down.
acc_dt_gini = accuracy_score(y_test, y_pred_dt_gini)
acc_dt_gini

In [None]:
# Visualise the confusion matrix of the Gini decision tree as a labelled heatmap.
class_names = ["Low Cost", "Budgeted", "Medium Cost", "Flagship"]
cm = confusion_matrix(y_test, y_pred_dt_gini)
# scikit-learn puts the true classes on the rows and the predicted classes on the columns.
conf_matrix = pd.DataFrame(data=cm, index=class_names, columns=class_names)
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu", ax=ax)
# Label both axes so the figure can be read without knowing the row/column convention.
ax.set_xlabel("Predicted class")
ax.set_ylabel("Actual class")
ax.set_title("Confusion Matrix for Decision Tree (Gini)")
plt.show()

In [None]:
# Decision tree with the entropy (information gain) criterion.
# DecisionTreeClassifier is already imported in the notebook's import cell.
dte = DecisionTreeClassifier(criterion="entropy", random_state=101)
dte.fit(x_train, y_train)
y_pred_dte = dte.predict(x_test)

# Visualise the confusion matrix as a labelled heatmap.
class_names = ["Low Cost", "Budgeted", "Medium Cost", "Flagship"]
cm = confusion_matrix(y_test, y_pred_dte)
# scikit-learn puts the true classes on the rows and the predicted classes on the columns.
conf_matrix = pd.DataFrame(data=cm, index=class_names, columns=class_names)
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu", ax=ax)
# Label both axes so the figure can be read without knowing the row/column convention.
ax.set_xlabel("Predicted class")
ax.set_ylabel("Actual class")
ax.set_title("Confusion Matrix for Decision Tree (Entropy)")
plt.show()

# Overall accuracy, kept for the model comparison further down.
acc_dte = accuracy_score(y_test, y_pred_dte)
# Per-class precision / recall / F1.
print(metrics.classification_report(y_test, y_pred_dte))

In [None]:
# ------------->RANDOM FOREST<------------
# 100 entropy-criterion trees; oob_score=True also computes the out-of-bag estimate.
# RandomForestClassifier is already imported in the notebook's import cell.
rf = RandomForestClassifier(n_estimators=100, random_state=0, criterion='entropy', oob_score=True)
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)

# Visualise the confusion matrix as a labelled heatmap.
class_names = ["Low Cost", "Budgeted", "Medium Cost", "Flagship"]
cm = confusion_matrix(y_test, y_pred_rf)
# scikit-learn puts the true classes on the rows and the predicted classes on the columns.
conf_matrix = pd.DataFrame(data=cm, index=class_names, columns=class_names)
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu", ax=ax)
# Label both axes so the figure can be read without knowing the row/column convention.
ax.set_xlabel("Predicted class")
ax.set_ylabel("Actual class")
ax.set_title("Confusion Matrix for Random Forest")
plt.show()

# Overall accuracy, kept for the model comparison further down.
acc_rf = accuracy_score(y_test, y_pred_rf)
# Per-class precision / recall / F1.
print(metrics.classification_report(y_test, y_pred_rf))

In [None]:
# Display the training frame.
train_df

In [None]:
#------------->SVM<-------------
from sklearn.svm import SVC
# Linear-kernel support vector classifier fitted on the training split as-is.
svm = SVC(kernel='linear', random_state=0)
svm.fit(x_train, y_train)
y_pred_svm = svm.predict(x_test)

# Visualise the confusion matrix as a labelled heatmap.
class_names = ["Low Cost", "Budgeted", "Medium Cost", "Flagship"]
cm = confusion_matrix(y_test, y_pred_svm)
# scikit-learn puts the true classes on the rows and the predicted classes on the columns.
conf_matrix = pd.DataFrame(data=cm, index=class_names, columns=class_names)
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu", ax=ax)
# Label both axes so the figure can be read without knowing the row/column convention.
ax.set_xlabel("Predicted class")
ax.set_ylabel("Actual class")
ax.set_title("Confusion Matrix for SVM")
plt.show()

# Overall accuracy, kept for the model comparison further down.
acc_svm = accuracy_score(y_test, y_pred_svm)
# Per-class precision / recall / F1.
print(metrics.classification_report(y_test, y_pred_svm))

In [None]:
# Compare accuracy on the training and held-out splits to spot over-fitting.
svm_train_accuracy = svm.score(x_train, y_train)
svm_test_accuracy = svm.score(x_test, y_test)
print("train accuracy:", svm_train_accuracy)
print("test accuracy:", svm_test_accuracy)

In [None]:
#------------>Accuracy Comparison<------------
# Tab-separated table of the held-out accuracy of every model fitted above.
models = ['dt_gini','DTE','RF','SVM']
acc_scores = [acc_dt_gini,acc_dte,acc_rf,acc_svm]
print("Models\tAccuracy\n")
for model_name, model_accuracy in zip(models, acc_scores):
    print(str(model_name) + "\t" + str(model_accuracy))

#### Detailed Classification Report

In [None]:


# Per-class (one-vs-rest) metrics for the SVM predictions, derived from the confusion matrix.
print("\n------------>Classification Report for SVM<------------")
confusion = confusion_matrix(y_test, y_pred_svm)

# One-vs-rest counts for all classes at once; position k in each array belongs to class k.
TP = np.diag(confusion)
FP = confusion.sum(axis=0) - TP   # predicted as k, actually another class
FN = confusion.sum(axis=1) - TP   # actually k, predicted as another class
# Every cell outside row k and column k is a true negative for class k. Summing only
# the other diagonal cells would leave out samples that were confused between two
# non-k classes, which skews specificity and per-class accuracy.
TN = confusion.sum() - (TP + FP + FN)

precision = TP / (TP + FP)
recall = TP / (TP + FN)
specificity = TN / (TN + FP)
accuracy = (TP + TN) / (TP + TN + FP + FN)
fscore = (2 * precision * recall) / (precision + recall)

# (row label incl. padding, per-class values, trailing text) — the table has four class columns.
report_rows = [
    ("Accuracy:" + "  ", accuracy, "\n"),
    ("Precision:" + " ", precision, "\n"),
    ("Recall:" + "    ", recall, "\n"),
    ("Specificity:" + "", specificity, "\n"),
    ("F-score:" + "   ", fscore, ""),
]

print("-----------------------------------------------------------------")
print("\t " "  Class 0 \t Class 1 \t Class 2 \t Class 3")
print("-----------------------------------------------------------------")
for row_label, per_class, trailer in report_rows:
    cells = ["%.4f" % value for value in per_class]
    print(row_label + cells[0] + " \t " + cells[1] + "\t" + " \t " + cells[2] + "\t" + " \t " + cells[3] + trailer)
print("-----------------------------------------------------------------")

#### Different Kernels:

In [None]:

# Fit one SVC per kernel type; C and the one-vs-one decision function are shared by all four.
shared_svc_kwargs = dict(C=1, decision_function_shape='ovo')
linear = SVC(kernel='linear', **shared_svc_kwargs).fit(x_train, y_train)
rbf = SVC(kernel='rbf', gamma=1, **shared_svc_kwargs).fit(x_train, y_train)
poly = SVC(kernel='poly', degree=3, **shared_svc_kwargs).fit(x_train, y_train)
sig = SVC(kernel='sigmoid', **shared_svc_kwargs).fit(x_train, y_train)

In [None]:
# Held-out accuracy of each kernel variant.
accuracy_lin = linear.score(x_test, y_test)
accuracy_poly = poly.score(x_test, y_test)
accuracy_rbf = rbf.score(x_test, y_test)
accuracy_sig = sig.score(x_test, y_test)

kernel_results = [
    ("Accuracy Linear Kernel:", accuracy_lin),
    ("Accuracy Polynomial Kernel:", accuracy_poly),
    ("Accuracy Radial Basis Kernel:", accuracy_rbf),
    ("Accuracy Sigmoid Kernel:", accuracy_sig),
]
for caption, kernel_accuracy in kernel_results:
    print(caption, kernel_accuracy)

### Grid search CV on SVM:

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid for SVC, written as one sub-grid per kernel so that each kernel
# is only crossed with the parameters it actually reads:
#   linear  -> C
#   rbf     -> C, gamma
#   poly    -> C, gamma, degree, coef0
#   sigmoid -> C, gamma, coef0
# A single flat grid would refit identical models for every value of an ignored parameter.
param = [
    {'kernel': ['linear'], 'C': [1, 10]},
    {'kernel': ['rbf'], 'C': [1, 10], 'gamma': ['auto', 'scale']},
    {'kernel': ['poly'], 'C': [1, 10], 'gamma': ['auto', 'scale'],
     'degree': [3, 8], 'coef0': [0.001, 10, 0.5]},
    {'kernel': ['sigmoid'], 'C': [1, 10], 'gamma': ['auto', 'scale'],
     'coef0': [0.001, 10, 0.5]},
]

In [None]:
# 5-fold cross-validated search over the SVC parameter grid, fitted on the training split.
SVModel = SVC()
Grids = GridSearchCV(estimator=SVModel, param_grid=param, cv=5)
Grids.fit(x_train, y_train)

In [None]:
# Parameter combination with the best cross-validated score.
Grids.best_params_

In [None]:
# Refit the SVM with the hyper-parameters selected by the grid search.
# best_params_ holds exactly the keyword arguments chosen for SVC (kernel, C, and
# whichever of gamma / degree / coef0 were searched), so it is unpacked directly
# instead of being copied in by hand.
svm = SVC(**Grids.best_params_, random_state=0)
svm.fit(x_train, y_train)
y_pred_svm = svm.predict(x_test)
# Held-out accuracy of the tuned model.
acc_svm = accuracy_score(y_test, y_pred_svm)

In [None]:
#Using the Best Algorithm
# Remove the extra ID column from the test frame.
# Re-binding with errors='ignore' keeps the cell safe to re-run: an in-place drop
# raises KeyError the second time, once 'id' is already gone.
test_df = test_df.drop(columns='id', errors='ignore')

In [None]:
# Display the test frame.
test_df

In [None]:
#Feature Scaling
# Standardise every column of test_df with a scaler fitted on test_df itself.
# NOTE(review): `svm` was fitted on the unscaled x_train, so predictions made from
# the standardised `test` array would not be on the scale the model was trained on — confirm.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
test = sc.fit_transform(test_df)

In [None]:
# (rows, columns) of the transformed test array.
test.shape

In [None]:
# Prediction
# svm was fitted on the unscaled x_train, so it must be given the raw test features,
# in the same column order as the training features. Selecting by x_train.columns also
# keeps the cell re-runnable once the prediction column has been added to test_df.
test_features = test_df[x_train.columns]
predicted_price_range = svm.predict(test_features)

test_df['predicted_price_range'] = predicted_price_range


In [None]:
# Display the test frame.
test_df