In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training and test CSVs into two separate frames
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/mobile-price-classification/train.csv',
        '../input/mobile-price-classification/test.csv',
    )
)

In [None]:
# Stack the training rows on top of the test rows in one combined frame
frames = [df_train, df_test]
df_tot = pd.concat(frames, axis="index")

In [None]:
# Analyze the data: print a summary of the combined frame
# (column names, non-null counts, dtypes and memory usage)
df_tot.info()

In [None]:
# The ID column is not required, so drop it.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it after 'id' is already gone no longer raises.
df_tot = df_tot.drop(columns='id', errors='ignore')

In [None]:
# Check the skew of the target variable, i.e. price_range.
# Rows without a label (NaN) are excluded explicitly so the statistic and the
# plot are computed over the same values.
price_range = df_tot['price_range'].dropna()
print(price_range.skew())
# sns.distplot is deprecated; histplot with kde=True and a density scale is
# its documented replacement.
sns.histplot(price_range, kde=True, stat="density")

In [None]:
# Display the column labels of the combined frame as the cell output
df_tot.columns

In [None]:
# Split the combined frame back into its train and test parts.
# df_tot was built by stacking df_train on top of df_test, so the boundary is
# the number of training rows; deriving it avoids a hard-coded row count.
n_train = len(df_train)
train = df_tot.iloc[:n_train, :]
test = df_tot.iloc[n_train:, :]

In [None]:
# Split into X (features) and Y (target).
# Columns are selected by name rather than by position so the split does not
# depend on column order or on the 'id' column having been dropped earlier.
TARGET = 'price_range'
feature_cols = [col for col in train.columns if col not in (TARGET, 'id')]
X_train = train[feature_cols]
X_test = test[feature_cols]
# Targets are kept as single-column DataFrames, as before.
Y_train = train[[TARGET]]
# NOTE(review): if the test file carries no price_range column, Y_test is all
# NaN after the concat and cannot be used for evaluation -- confirm.
Y_test = test[[TARGET]]

In [None]:
# Feature selection: recursive feature elimination with 3-fold
# cross-validation, using a default XGBoost regressor to rank the features.
# NOTE(review): the target is modelled with classifiers further down; confirm
# that a regressor is the intended ranking estimator here.
from sklearn.feature_selection import RFECV
from xgboost import XGBRegressor

boost = XGBRegressor()
rfecv = RFECV(estimator=boost, cv=3, n_jobs=-1).fit(X_train, Y_train)
print(f"No. of highly important features: {rfecv.n_features_}")

In [None]:
# Keep only the columns that RFECV marked as selected, in both splits
imp_features = np.asarray(X_train.columns)[rfecv.support_]
X_imp = X_train.loc[:, imp_features]
X_test_imp = X_test.loc[:, imp_features]
X_test_imp.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import RobustScaler

In [None]:
# Scale the selected features, fit a shallow random forest and report how
# well it reproduces the training labels.
robust = RobustScaler()
X_imp_scaled = robust.fit_transform(X_imp)

# scikit-learn classifiers expect a 1-D label vector. Y_train is a
# single-column DataFrame, so flatten it to avoid a DataConversionWarning.
y_train_1d = np.ravel(Y_train)

random = RandomForestClassifier(max_depth=2, random_state=0)
random.fit(X_imp_scaled, y_train_1d)
res = random.predict(X_imp_scaled)

# NOTE: these metrics are computed on the same rows the model was fitted on,
# so they are optimistic and do not estimate performance on unseen data.
print(classification_report(y_train_1d, res))
print(confusion_matrix(y_train_1d, res))

In [None]:
# k-NN (k=10) on the scaled, RFECV-selected features
from sklearn.neighbors import KNeighborsClassifier

# Flatten the single-column label frame; passing it as-is makes scikit-learn
# emit a DataConversionWarning.
y_train_1d = np.ravel(Y_train)

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_imp_scaled, y_train_1d)
res = knn.predict(X_imp_scaled)

# NOTE: evaluated on the training rows themselves, so the scores are optimistic.
print(classification_report(y_train_1d, res))
print(confusion_matrix(y_train_1d, res))

In [None]:
# k-NN (k=10) on the RFECV-selected features without scaling
from sklearn.neighbors import KNeighborsClassifier

# Flatten the single-column label frame; passing it as-is makes scikit-learn
# emit a DataConversionWarning.
y_train_1d = np.ravel(Y_train)

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_imp, y_train_1d)
res = knn.predict(X_imp)

# NOTE: evaluated on the training rows themselves, so the scores are optimistic.
print(classification_report(y_train_1d, res))
print(confusion_matrix(y_train_1d, res))

In [None]:
# k-NN (k=10) on all feature columns without scaling
from sklearn.neighbors import KNeighborsClassifier

# Flatten the single-column label frame; passing it as-is makes scikit-learn
# emit a DataConversionWarning.
y_train_1d = np.ravel(Y_train)

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train_1d)
res = knn.predict(X_train)

# NOTE: evaluated on the training rows themselves, so the scores are optimistic.
print(classification_report(y_train_1d, res))
print(confusion_matrix(y_train_1d, res))