In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Directory holding the two CSV files. The default is the original location;
# set the MOBILE_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
DATA_DIR = Path(os.environ.get("MOBILE_DATA_DIR", r"C:\Users\Saidabrorkhon\ML_Lectures"))

# Train data
df = pd.read_csv(DATA_DIR / "mobile_train.csv")
# Test data
df_test = pd.read_csv(DATA_DIR / "mobile_test.csv")

Data Exploration

In [7]:
# (rows, columns) of the train frame followed by the test frame.
tuple(frame.shape for frame in (df, df_test))

((2000, 21), (1000, 21))

In [11]:
# Preview the first two rows of the training frame.
df.head(2)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2


In [12]:
# Preview the first two rows of the test frame.
df_test.head(2)

Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1,1043,1,1.8,1,14,0,5,0.1,193,...,16,226,1412,3476,12,7,2,0,1,0
1,2,841,1,0.5,1,4,1,61,0.8,191,...,12,746,857,3895,6,0,7,1,0,0


In [3]:
# 1000 rows looked like too many for the test set, so half of the test frame is
# moved into the training frame later on. For the two frames to be stackable,
# both are reduced here to the columns they have in common.
#
# NOTE: `price_range` (the target) exists only in the training frame and `id`
# exists only in the test frame, so the intersection removes `price_range`
# from `df`. The labels are copied aside first so they are not lost; the
# membership check keeps this cell safe to re-run once `df` has already been
# reduced.
if "price_range" in df.columns:
    train_target = df["price_range"].copy()

# Column labels present in both frames, in the training frame's order.
common_cols = df.columns.intersection(df_test.columns)
df = df[common_cols]
df_test = df_test[common_cols]

In [21]:
# Number of column labels shared by the train and test frames.
common_cols.shape

(20,)

In [4]:
# Draw a reproducible random half (500 rows) of the test frame.
test_sample = df_test.sample(n=500, random_state=42)

# Keep only the test rows whose index labels were not drawn above.
df_test = df_test[~df_test.index.isin(test_sample.index)]

# Append the drawn rows to the training frame, then renumber the index from 0
# so the labels carried over from the test frame cannot collide with existing ones.
df = pd.concat([df, test_sample]).reset_index(drop=True)

In [23]:
# (rows, columns) of `df` after the sampled test rows were appended.
df.shape

(2500, 20)

In [24]:
# Preview the first two rows of `df` after the column reduction and append.
df.head(2)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0


In [29]:
# Missing-value count per column.
df.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
dtype: int64

In [30]:
# Per-column dtype and non-null count for `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2500 non-null   int64  
 1   blue           2500 non-null   int64  
 2   clock_speed    2500 non-null   float64
 3   dual_sim       2500 non-null   int64  
 4   fc             2500 non-null   int64  
 5   four_g         2500 non-null   int64  
 6   int_memory     2500 non-null   int64  
 7   m_dep          2500 non-null   float64
 8   mobile_wt      2500 non-null   int64  
 9   n_cores        2500 non-null   int64  
 10  pc             2500 non-null   int64  
 11  px_height      2500 non-null   int64  
 12  px_width       2500 non-null   int64  
 13  ram            2500 non-null   int64  
 14  sc_h           2500 non-null   int64  
 15  sc_w           2500 non-null   int64  
 16  talk_time      2500 non-null   int64  
 17  three_g        2500 non-null   int64  
 18  touch_sc

In [None]:
# Standardize every int64/float64 column of `df` in place
# (StandardScaler defaults: zero mean, unit variance per column).
scaler = StandardScaler()
num_col = df.select_dtypes(include=["int64", "float64"]).columns

# Learn the per-column statistics, then apply them to the same columns.
df[num_col] = scaler.fit(df[num_col]).transform(df[num_col])

In [6]:
# Inspect the first rows of `df` after standardization.
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,-0.910794,-1.001601,0.828559,-1.035013,-0.760265,-1.034184,-1.394246,0.341162,1.361655,-1.082476,-1.303201,-1.413862,-1.137887,0.373619,-0.75222,0.305565,1.459942,-1.801245,-1.002403,0.982552
1,-0.502331,0.998401,-1.248948,0.966172,-0.98962,0.966946,1.140827,0.690714,-0.109878,-0.646626,-0.645151,0.60194,1.715859,0.44867,1.129976,-0.613427,-0.73623,0.555172,0.997603,-1.017758
2,-1.547448,0.998401,-1.248948,0.966172,-0.53091,0.966946,0.479503,1.389817,0.14481,0.225073,-0.645151,1.417372,1.085811,0.423043,-0.281671,-0.843175,-0.370201,0.555172,0.997603,-1.017758
3,-1.428788,0.998401,1.195178,-1.035013,-0.98962,-1.034184,-1.228915,1.040265,-0.251372,0.660922,-0.151615,1.310318,1.247956,0.574976,0.894702,0.535313,-0.004173,0.555172,-1.002403,-1.017758
4,1.3232,0.998401,-0.393504,-1.035013,1.991992,0.966946,0.644834,0.341162,0.031615,-1.082476,0.670947,1.292096,-0.08163,-0.667948,-0.987494,-0.843175,0.727885,0.555172,0.997603,-1.017758
