## Mobile Price Classification

In [None]:
#importing data analysis tools
import pandas as pd
import numpy as np
#importing data visualization tools
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training CSV from the Kaggle input directory into `train_data`.
train_data = pd.read_csv(
    '/kaggle/input/mobile-price-classification/train.csv'
)

In [None]:
# Number of missing (NaN) entries in each column of the training frame.
train_data.isnull().sum()

In [None]:
# (number of rows, number of columns) of the training DataFrame
train_data.shape

In [None]:
# column labels of the training DataFrame
train_data.columns

In [None]:
# print the summary that DataFrame.info() produces for the training frame
train_data.info()

In [None]:
# For each column of train_data, print the column name and then how often
# every distinct value occurs in it.
for column_name, column_values in train_data.items():
    print('column name:{}'.format(column_name))
    print(column_values.value_counts())

In [None]:
# Correlation between price_range and every other column, drawn as a
# horizontal bar chart sorted by coefficient.
fig, ax = plt.subplots(figsize=(10, 6))
plt.title('Corelation of price_range', size=20, color='red')
price_range_corr = train_data.corr()['price_range'].sort_values(ascending=False)
# Remove the target's correlation with itself by label instead of slicing off
# the first entry, so the result does not depend on where it lands in the sort.
# Draw on the axes created above rather than relying on the implicit current axes.
price_range_corr.drop('price_range').plot(kind='barh', ax=ax)

In [None]:
# Joint KDE plot of the int_memory column against the ram column, with a
# separate density for each price_range value.
sns.jointplot(
    data=train_data,
    x='int_memory',
    y='ram',
    hue='price_range',
    kind='kde',
    palette='Set2',
)

In [None]:
# Bar plot of battery_power (y axis) for each price_range (x axis) on a
# 10x6 inch figure.
plt.figure(figsize=(10, 6))
plt.title('bateery_power Vs price_range')
sns.barplot(
    data=train_data,
    x='price_range',
    y='battery_power',
)

In [None]:
# Swarm plots of ram per price_range, coloured by touch_screen, laid out on a
# grid with one column per dual_sim value and one row per wifi value.
sns.catplot(
    data=train_data,
    kind='swarm',
    x='price_range',
    y='ram',
    hue='touch_screen',
    col='dual_sim',
    row='wifi',
)

In [None]:
# Joint plot of px_height against px_width, coloured by price_range.
sns.jointplot(
    data=train_data,
    x='px_height',
    y='px_width',
    hue='price_range',
    palette='coolwarm',
)

In [None]:
# Histogram of price_range with the four_g groups stacked on top of each other.
sns.histplot(
    data=train_data,
    x='price_range',
    hue='four_g',
    multiple='stack',
)

In [None]:
# Violin plot of three_g for each price_range on a 7x5 inch figure; the y axis
# is limited to the two tick marks 0 and 1.
fig, ax = plt.subplots(figsize=(7, 5))
ax.set_title('price_range Vs three_g', color='red', size=18)
sns.violinplot(data=train_data, x='price_range', y='three_g', ax=ax)
ax.set_yticks([0, 1])

In [None]:
# 5th and 95th percentiles of ram, kept as lower/upper cut-offs for the
# outlier inspection.
ram_minimum = train_data['ram'].quantile(q=0.05)
ram_max = train_data['ram'].quantile(q=0.95)

In [None]:
# Inspect rows whose ram looks inconsistent with their price_range:
#   - ram above the 95th percentile but price_range below 3
#   - ram below the 5th percentile but price_range above 1
high_ram_low_price = train_data[(train_data['ram'] > ram_max) & (train_data['price_range'] < 3)]
low_ram_high_price = train_data[(train_data['ram'] < ram_minimum) & (train_data['price_range'] > 1)]
# A notebook cell renders only its last expression, so two bare expressions
# would silently drop the first result. Stack both selections into one frame,
# labelled by which rule matched, so that both are displayed.
pd.concat(
    [high_ram_low_price, low_ram_high_price],
    keys=['high_ram_low_price', 'low_ram_high_price'],
)

### Data preprocessing

In [None]:
#normalizing data
from sklearn.preprocessing import StandardScaler
#importing sklearn libraries for scoring models
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [None]:
# Standardise every column except the price_range target.
# NOTE(review): the scaler is fitted on all rows of train_data, i.e. before the
# train/test split, so rows later held out for evaluation also contribute to
# the scaling statistics.
scaler = StandardScaler()
train_scale_data = scaler.fit_transform(train_data.drop(columns=['price_range']))

In [None]:
# Hold out 33% of the scaled rows (and their price_range labels) for
# evaluation; the split is seeded so it is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    train_scale_data,
    train_data['price_range'],
    test_size=0.33,
    random_state=101,
)

In [None]:
# first five rows of the scaled feature matrix (the full matrix, not only X_train)
train_scale_data[:5]

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit an L2-penalised logistic regression on the training split, predict the
# held-out split, and show the accuracy on the training split.
# `multi_class='multinomial'` is intentionally not passed: the argument is
# deprecated in recent scikit-learn releases, and with the newton-cg solver on a
# multi-class target the default already selects the multinomial formulation.
lr = LogisticRegression(
    penalty='l2',
    solver='newton-cg',
    max_iter=400,
    C=0.5,
    random_state=101,
)
lr.fit(X_train, y_train)
pred1 = lr.predict(X_test)
lr.score(X_train, y_train)

In [None]:
# Per-class precision/recall/F1 report for the logistic model's predictions
# on the held-out split.
print(classification_report(y_true=y_test, y_pred=pred1))

In [None]:
# Start the model-name -> held-out accuracy mapping with the logistic model.
scores = {"logistic": accuracy_score(y_test, pred1)}

In [None]:
# Annotated heat-map of the confusion matrix for the logistic model
# (y_test against pred1).
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    confusion_matrix(y_test, pred1),
    ax=ax,
    annot=True,
    linewidths=1,
    cmap='coolwarm',
    cbar=False,
)
ax.set_xlabel('Prediction Price', size=14)
ax.set_ylabel('Actual Price', size=14)
ax.set_title('Confusion_matrix for Actual Prices Vs Prediction Prices', color='red')

In [None]:
# Support vector classifier with a linear kernel, fitted on the training
# split; the cell shows its accuracy on that same split.
from sklearn.svm import SVC
classifier = SVC(
    kernel='linear',
    decision_function_shape='ovo',
    random_state=101,
).fit(X_train, y_train)
classifier.score(X_train, y_train)

In [None]:
# Predict the held-out split with the SVC and record its accuracy.
pred2 = classifier.predict(X_test)
scores.update(svc=accuracy_score(y_test, pred2))

In [None]:
# Per-class precision/recall/F1 report for the SVC predictions on the
# held-out split.
print(classification_report(y_true=y_test, y_pred=pred2))

In [None]:
# Annotated heat-map of the confusion matrix for the SVC (y_test against pred2).
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    confusion_matrix(y_test, pred2),
    ax=ax,
    annot=True,
    linewidths=1,
    cmap='coolwarm',
    cbar=False,
)
ax.set_xlabel('Prediction Price', size=14)
ax.set_ylabel('Actual Price', size=14)
ax.set_title('Confusion_matrix for Actual Prices Vs Prediction Prices', color='red');

In [None]:
#RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
#importing RandomizedSearchCV for hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Randomised hyperparameter search for a RandomForestClassifier:
# 20 sampled candidates, each scored with 3-fold cross-validation on the
# training split.
# "auto" is deliberately absent from max_features: for classifiers it was an
# alias of "sqrt" and has been removed from recent scikit-learn releases, where
# every candidate using it would fail to fit.
rf_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 1],
           "min_samples_split": np.arange(2, 20, 2),
           "max_features": [0.5, 1, "sqrt"]}

# Seed both the forest and the candidate sampling (same seed as the other
# models in this notebook) so repeated runs pick the same parameters.
rs_model = RandomizedSearchCV(RandomForestClassifier(random_state=101),
                              param_distributions=rf_grid,
                              n_iter=20,
                              cv=3,
                              verbose=True,
                              random_state=101)

rs_model.fit(X_train, y_train)

In [None]:
# Show the parameter combination that the randomised search selected as best
rs_model.best_params_

In [None]:
# Train the final random forest with the parameters selected by the
# randomised search. Reading them from rs_model.best_params_ (instead of
# copying the values by hand) keeps this cell in step with the search whenever
# it is re-run. The seed matches the other models in this notebook.
final_model = RandomForestClassifier(**rs_model.best_params_, random_state=101)
final_model.fit(X_train, y_train)
# accuracy on the training split
final_model.score(X_train, y_train)

In [None]:
# Predict the held-out split with the random forest and record its accuracy.
pred3 = final_model.predict(X_test)
scores.update(random_forest=accuracy_score(y_test, pred3))

In [None]:
# Annotated heat-map of the confusion matrix for the random forest
# (y_test against pred3).
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    confusion_matrix(y_test, pred3),
    ax=ax,
    annot=True,
    linewidths=1,
    cmap='coolwarm',
    cbar=False,
)
ax.set_xlabel('Prediction Price', size=14)
ax.set_ylabel('Actual Price', size=14)
ax.set_title('Confusion_matrix for Actual Prices Vs Prediction Prices', color='red');

In [None]:
# Decision tree using the entropy split criterion, fitted on the training
# split; the cell shows its accuracy on that same split.
from sklearn.tree import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier(criterion='entropy').fit(X_train, y_train)
dt_classifier.score(X_train, y_train)

In [None]:
# Predict the held-out split with the decision tree and record its accuracy.
pred4 = dt_classifier.predict(X_test)
scores.update(decision_tree=accuracy_score(y_test, pred4))

In [None]:
# Per-class precision/recall/F1 report for the decision-tree predictions on
# the held-out split.
print(classification_report(y_true=y_test, y_pred=pred4))

In [None]:
# model name -> accuracy on the held-out split, as collected in the cells above
scores

In [None]:
# Bar chart comparing the held-out accuracy of every model in `scores`,
# one bar per model, labelled with the model name.
names = [*scores]
values = [scores[model_name] for model_name in names]
plt.bar(range(len(scores)), values, tick_label=names);

### Test_data Preprocessing

In [None]:
# The author reports the best scores for the SVC and logistic models; the
# logistic model is the one used below to predict price_range for the test set.
# Read that test set from the Kaggle input directory.
test_data = pd.read_csv(
    '/kaggle/input/mobile-price-classification/test.csv'
)

In [None]:
# column labels of the test DataFrame
test_data.columns

In [None]:
# first five test rows, transposed so each column of the display is one row of the data
test_data.head(5).T

In [None]:
# Scale the test features (everything except `id`) with the scaler that was
# already fitted on the training features. Use transform(), not fit_transform():
# refitting here would re-estimate the mean/std from the test set, so the model
# would receive inputs on a different scale from the one it was trained on.
test_scale_data = scaler.transform(test_data.drop('id', axis=1))
# preview the first four scaled rows
test_scale_data[:4]

In [None]:
# Predict a price_range for every scaled test row with the already-fitted
# logistic model (no fitting happens here).
price_ranges = lr.predict(X=test_scale_data)

In [None]:
# first ten predicted price_range values
price_ranges[:10]

In [None]:
# Attach the predictions to the test frame as a price_range column.
test_data.loc[:, 'price_range'] = price_ranges

In [None]:
# first five test rows (transposed), now including the predicted price_range
test_data.head(5).T