In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

# Load the training and test tables from the semicolon-separated CSV files.
csv_options = {"sep": ";"}

phone_features = pd.read_csv("../input/mobile-price/train.csv", **csv_options)
phone_features_test = pd.read_csv("../input/mobile-price/test.csv", **csv_options)



In [None]:
# Descriptive statistics of the training data; left as the cell's last
# expression so the notebook renders the resulting table.
phone_features.describe()


In [None]:
# Print the column dtypes and non-null counts of the training data.
phone_features.info()
# Per the author's run, the output shows that no column contains null values.

In [None]:
# Now we can remove correlated features, if any appear. Highly correlated features limit our ability to build a proper model:
# when they are present, their effects become inseparable, so the learned dependencies can be corrupted. As we can see below, there is no such problem here - most features are not correlated.

In [None]:
# Pairwise correlation between the columns of the training data.
correlation = phone_features.corr()

# Draw the matrix on a fixed [-1, 1] colour scale centred at 0.
# Do not name the returned Axes `map`: that would shadow the built-in map()
# for every later cell in the notebook.
heatmap_ax = sns.heatmap(correlation, vmin=-1, vmax=1, center=0)
heatmap_ax.set_title("Feature correlation matrix")
plt.show()

In [None]:
# Predictors: every training column except the target; the unlabeled test
# table only loses its 'id' column.
PF = phone_features.drop(columns=['price_range'])
PF_test = phone_features_test.drop(columns=['id'])

# Hold out 30% of the labelled rows for evaluation, using a fixed seed.
price_labels = phone_features['price_range']
split_parts = train_test_split(PF, price_labels, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts


In [None]:
#MODELS - comparison and selection

# 1: random forest
# The forest is a randomized estimator; seeding it (like the train/test split)
# keeps the report below, and the later model comparison, stable across
# Restart & Run All.
random_forest = RandomForestClassifier(random_state=0)
random_forest.fit(X_train, y_train)
prediction1 = random_forest.predict(X_test)
print("Random forest:")
print(classification_report(y_test, prediction1))


In [None]:
# 2: logistic regression
# One-vs-rest logistic regression fitted with the liblinear solver.
# NOTE(review): newer scikit-learn releases may deprecate `multi_class` -
# confirm against the installed version.
lr = LogisticRegression(multi_class='ovr', solver='liblinear').fit(X_train, y_train)
prediction2 = lr.predict(X_test)
print("Logistic regression:", classification_report(y_test, prediction2), sep="\n")

In [None]:
# 3: naive bayes
# Gaussian naive Bayes with default settings, scored on the held-out split.
Naive_bayes = GaussianNB().fit(X_train, y_train)
prediction3 = Naive_bayes.predict(X_test)
nb_report = classification_report(y_test, prediction3)
print("Naive Bayes:")
print(nb_report)

In [None]:
# 4: support vector machine
# Default SVC; the fitted model is reused later for the final predictions.
Support_vector_machine = SVC().fit(X_train, y_train)
prediction4 = Support_vector_machine.predict(X_test)
svm_report = classification_report(y_test, prediction4)
print("Support vector machine:", svm_report, sep="\n")


In [None]:
# Choosing 'n' for nearest neighbors
#
# Fit one KNN classifier per candidate neighbor count, record its weighted F1
# score on the held-out split, and plot the scores as a bar chart.
# NOTE(review): 'n' is chosen on the same X_test/y_test split that is later
# used to report the KNN score, so that score is optimistically biased.

neighbor_candidates = range(1, 10)
f1_storage = []

for i in neighbor_candidates:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn = knn.fit(X_train, y_train)
    knn_predict = knn.predict(X_test)
    # Compute the score once and reuse it for both the log line and the plot.
    current_f1 = f1_score(y_test, knn_predict, average='weighted')
    print(i, current_f1)
    f1_storage.append(current_f1)

# Derive the bar labels from the candidates so they cannot drift out of sync
# with the loop range.
numbers = [str(k) for k in neighbor_candidates]

plt.bar(numbers, f1_storage)
plt.xlabel('n_neighbors')
plt.ylabel('weighted F1 score')

plt.ylim(0.75, 1)
# Annotate each bar with its score as a percentage.
for position, bar_score in enumerate(f1_storage):
    plt.text(position, bar_score, str(round(bar_score*100, 3))+'%', size=8, ha='center', va='bottom')

plt.show()


In [None]:
# As we can see above, the best 'n' value is 7.

In [None]:
# 5: KNN
# K-nearest-neighbors classifier with 7 neighbors, scored on the held-out split.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
prediction5 = knn.predict(X_test)
knn_report = classification_report(y_test, prediction5)
print("KNN:", knn_report, sep="\n")

In [None]:
# Weighted F1 score of each fitted model on the held-out split.
Random_forest_results = f1_score(y_test, prediction1, average='weighted')
Logistic_regression_results = f1_score(y_test, prediction2, average='weighted')
Naive_Bayes_results = f1_score(y_test, prediction3, average='weighted')
Support_vector_machine_results = f1_score(y_test, prediction4, average='weighted')
KNN_results = f1_score(y_test, prediction5, average='weighted')

# Scores and their bar labels, kept in the same order.
models = [Random_forest_results, Logistic_regression_results, Naive_Bayes_results, Support_vector_machine_results, KNN_results]
model_names = ['RF', 'LR', 'NB', 'SVM', 'KNN']

plt.bar(model_names, models)
plt.ylabel('weighted F1 score')
plt.ylim(0.5, 1)
# Annotate each bar with its score as a whole-number percentage.
for position, model_score in enumerate(models):
    plt.text(position, model_score, str(round(model_score*100))+'%', size=8, ha='center', va='bottom')

# Render the figure explicitly, as the other plotting cells do.
plt.show()

In [None]:
# Finally, we can see that the Support Vector Machine algorithm gives the best results and is the best model to predict the price range.

In [None]:
# Predicted price range for every row of the unlabeled test set, using the
# fitted support vector machine.
final_predictions = Support_vector_machine.predict(PF_test)
print(final_predictions)