In [1]:
import numpy as np
import pandas as pd


#### Read dataset

In [2]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


#### Find initial correlation between features and target variable using Correlation Matrix

In [3]:
correlation_matrix = df.corr()
print(correlation_matrix['price_range'])

battery_power    0.200723
blue             0.020573
clock_speed     -0.006606
dual_sim         0.017444
fc               0.021998
four_g           0.014772
int_memory       0.044435
m_dep            0.000853
mobile_wt       -0.030302
n_cores          0.004399
pc               0.033599
px_height        0.148858
px_width         0.165818
ram              0.917046
sc_h             0.022986
sc_w             0.038711
talk_time        0.021859
three_g          0.023611
touch_screen    -0.030411
wifi             0.018785
price_range      1.000000
Name: price_range, dtype: float64


#### In the correlation matrix:

* Strongly correlated feature: ram (0.917) has a very high positive correlation with the target variable, indicating a strong relationship with the price range.
* Moderately correlated features: px_width (0.166), px_height (0.149), and battery_power (0.201) show moderate positive correlations, meaning they contribute to predicting the target but not as strongly as ram.
* Weakly correlated features: Features like clock_speed, mobile_wt, and touch_screen show very low or near-zero correlation, suggesting little or no relationship with the target.

#### Split train and test data

In [4]:
from sklearn.model_selection import train_test_split

X = df.iloc[:, :-1]
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

#### Scale features as values are not in the same range

In [6]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [46]:
#col_headers = df.columns.tolist()
#col_headers

#### Prediction using Random Forest Classifier

In [7]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=50)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.89      0.92      0.91        92
           1       0.77      0.81      0.79        97
           2       0.84      0.78      0.81       109
           3       0.92      0.92      0.92       102

    accuracy                           0.86       400
   macro avg       0.86      0.86      0.86       400
weighted avg       0.86      0.86      0.86       400



In [8]:
print(model.feature_importances_)

[0.07605963 0.00595283 0.02980911 0.0075532  0.02583495 0.00640077
 0.03548693 0.02486058 0.03789844 0.02183654 0.03022802 0.059485
 0.05718593 0.47498735 0.02845015 0.02820803 0.03118065 0.00531919
 0.00675637 0.00650633]


#### The model.feature_importances_ array can be interpreted as follows:

**Top Feature:**
ram (0.4805) is the most significant feature, indicating it has a strong influence on predictions.


**Moderate Importance:**
battery_power (0.0742), px_height (0.0595), and four_g (0.0062) have moderate impacts on the model's performance.

**Low Importance:**
Features such as blue (0.0071) and dual_sim (0.0058) contribute minimally, suggesting they could be candidates for removal in feature selection processes.


#### Removing some low correlated features depending on Correlation Matrix & Feature Importance methods

In [9]:
df = df.drop(['blue', 'three_g', 'clock_speed'], axis=1) # Removing some low correlated features
df.head()

Unnamed: 0,battery_power,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,touch_screen,wifi,price_range
0,842,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,1,1
1,1021,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,0,2
2,563,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,0,2
3,615,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,0,0,2
4,1821,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,0,1


#### Split train and test data again

In [10]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

#### Scale features again

In [11]:
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Prediction using Random Forest Classifier without low correlated features

In [12]:
model = RandomForestClassifier(n_estimators=50)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.92      0.93        92
           1       0.80      0.90      0.84        97
           2       0.85      0.81      0.83       109
           3       0.94      0.90      0.92       102

    accuracy                           0.88       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.88      0.88      0.88       400



#### Accuarcy increased from 86 to 89  after removing very low correlated features 

In [13]:
### Predict test.csv data using to developed model

In [15]:
df_test = pd.read_csv("test.csv")
X_df_test = df_test.drop(['id', 'blue', 'three_g', 'clock_speed'], axis=1) # Removing extra and some low correlated features
#X_df_test = df_test.iloc[:,:]

scaler = MinMaxScaler()
scaler.fit(X_df_test)

X_df_test = scaler.transform(X_df_test)

y_pred_test = model.predict(X_df_test)
print(y_pred_test)

[3 3 2 3 1 3 3 1 3 0 3 3 0 0 2 0 2 1 3 2 1 3 1 1 3 0 2 0 3 0 2 0 3 0 0 1 3
 1 2 1 1 2 0 0 0 1 0 3 1 2 2 0 2 0 3 1 3 1 1 3 3 2 0 1 1 1 1 3 1 2 1 2 2 3
 3 0 2 0 2 3 0 3 3 0 3 0 3 1 3 0 1 2 2 0 2 1 0 2 1 2 1 0 0 3 1 2 0 1 2 3 3
 3 1 3 3 3 3 1 3 0 0 3 2 1 1 0 3 2 3 1 0 2 1 1 3 1 1 0 3 2 1 3 1 2 2 3 3 2
 2 3 2 3 0 0 2 2 3 3 3 3 2 2 3 3 3 3 1 0 3 0 0 0 1 1 0 1 0 0 1 2 0 0 0 1 2
 2 2 1 0 0 0 0 0 3 1 0 2 2 2 3 1 2 2 3 3 1 2 1 0 0 1 3 0 3 3 3 0 2 0 3 2 3
 3 0 0 1 0 3 0 1 0 2 2 1 3 0 3 0 3 1 2 0 0 2 1 3 3 3 1 1 3 0 0 2 3 3 1 3 1
 0 3 2 1 2 3 3 3 1 0 1 2 3 1 1 3 2 0 3 0 1 2 0 0 3 3 3 3 2 1 3 3 2 3 2 2 1
 1 0 2 3 1 0 0 3 0 3 0 1 2 0 2 3 1 3 2 2 1 2 0 0 0 1 3 1 0 0 0 3 2 0 3 3 1
 2 2 2 3 1 3 3 2 2 3 3 3 1 3 0 3 1 3 1 2 3 0 1 1 3 1 3 2 3 0 0 0 0 2 0 0 2
 1 1 2 3 2 0 1 0 0 3 2 0 3 1 2 2 1 2 3 1 1 3 2 1 2 0 1 1 0 3 2 0 0 1 0 0 1
 1 0 0 0 2 2 3 2 3 0 2 1 3 0 1 1 1 1 0 3 2 3 3 1 3 1 3 1 3 2 1 2 2 1 1 0 0
 0 1 2 1 0 3 2 0 2 3 0 0 3 1 1 1 3 2 3 0 3 0 2 3 3 3 0 2 0 2 3 0 1 1 0 0 1
 1 2 3 3 3 2 3 1 2 2 2 3 