# **4-qadam Machine Learning**

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk

In [2]:
url = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(url)

In [3]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [4]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=50)

X_train = train_set.drop("median_house_value", axis=1)
y = train_set["median_house_value"].copy()

X_num = X_train.drop("ocean_proximity", axis=1)

## Pipeline quramiz

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

# bizga kerak ustunlar indexlari
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesadder(BaseEstimator, TransformerMixin):
  def __init__(self, add_bedrooms_per_room=True):
    self.add_bedrooms_per_room = add_bedrooms_per_room
  def fit(self, X, y=None):
    return self # bizning funksiyamiz faqat transformer. estimator emas
  def transform(self, X):
    rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
    population_per_household = X[:, population_ix] / X[:, households_ix]
    if self.add_bedrooms_per_room:  # add_bedrooms_per_room ustuni ixtiyoriy bo'ladi
      bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
      return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
    else:
      return np.c_[X, rooms_per_household, population_per_household]

## Sonli ustunlar uchun

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('atter_adder', CombinedAttributesadder(add_bedrooms_per_room= True)),
    ('std_scaler', StandardScaler())
])

## Matnli ustunlar uchun

In [8]:
from sklearn.compose import ColumnTransformer

num_attribs = list(X_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])

In [9]:
X_prepared = full_pipeline.fit_transform(X_train)

In [10]:
X_prepared

array([[ 0.67744368, -0.73125093,  1.06514215, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.23079392, -1.36705675,  0.03230036, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.13607631, -1.13797965, -1.00054142, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.82201266, -0.79670153, -1.3183389 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.64753286, -0.86215213,  0.82679405, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.16100199, -0.4881487 , -1.95393385, ...,  0.        ,
         0.        ,  0.        ]])

In [11]:
X_train
# Bu yuqorida yaratgan median_house_value ustunini tashlab yaratgan ustunimiz

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
4450,-118.21,34.07,42.0,902.0,318.0,1312.0,323.0,1.9375,<1H OCEAN
14253,-117.10,32.71,29.0,3422.0,713.0,2775.0,644.0,1.7075,NEAR OCEAN
15338,-117.29,33.20,16.0,2150.0,461.0,1428.0,407.0,2.4754,NEAR OCEAN
18156,-122.06,37.36,35.0,2693.0,493.0,1343.0,455.0,6.0777,<1H OCEAN
13431,-117.42,34.10,18.0,3977.0,809.0,2231.0,742.0,4.1399,INLAND
...,...,...,...,...,...,...,...,...,...
14565,-117.20,32.84,32.0,2033.0,394.0,989.0,389.0,3.2583,NEAR OCEAN
15649,-122.41,37.79,52.0,2161.0,544.0,904.0,431.0,3.5066,NEAR BAY
10123,-117.92,33.93,12.0,4415.0,890.0,1532.0,854.0,3.7500,<1H OCEAN
5600,-118.27,33.79,39.0,1513.0,365.0,1227.0,354.0,3.3929,<1H OCEAN


## Ma'lumotlar ML uchun tayyor.

## **Machine Learning**

In [12]:
### Linear Regression

## Linear Regression - Chiziqli **regressiya**
## sklearn tarkibidagi LinearRegression klassidan yangi model yaratamiz.

In [13]:
from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()

LR_model.fit(X_prepared, y)

# Ushbu kod parchasi Sklearn kutubxonasidan LinearRegression modelini import qiladi, uni LR_model
# o'zgaruvchisiga o'rnatadi va keyin X_prepared (tayyorlangan xususiyatlar) hamda y (maqsad qiymatlar)
# yordamida modelni o'qitadi. Ya'ni, model kiritilgan ma'lumotlar asosida natijalarni taxmin qilishni o'rganadi.

LinearRegression bu estimator. Estimatorlar ma'lumotlarni qabul qilib oladi va .fit() metodi yordamida ulardan basorat qilishni o'rganadi (machine learning)

TAMOM! Machine Learning tugadi! bor yo'g'i 3 qator kod bilan biz kompyuterga uylarni narxini bashorat qilishni o'rgatdik.

Modelni qanday qilib tekshirib ko'rishimiz mumkin? housing datasetdan biror qatorni modelga beramiz va chiqqan natijani bizdagi bor natija (label) bilan solishtiramiz.

In [14]:
test_data = X_train.sample(10)
test_data
# test_data ga X_train dan taxminiy 10 tasini saqlab chiqarayabmiz

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
14698,-117.09,32.79,31.0,2019.0,417.0,872.0,386.0,3.1964,NEAR OCEAN
17459,-119.88,34.4,25.0,2741.0,623.0,2272.0,624.0,2.2647,NEAR OCEAN
12538,-121.49,38.56,52.0,1844.0,392.0,667.0,353.0,3.0033,INLAND
15121,-116.94,32.85,31.0,1293.0,232.0,599.0,228.0,4.7578,<1H OCEAN
1432,-122.02,37.99,37.0,2247.0,416.0,1237.0,397.0,4.45,NEAR BAY
16759,-122.48,37.71,29.0,1048.0,150.0,455.0,152.0,6.1278,NEAR OCEAN
16814,-122.43,37.64,34.0,8400.0,1812.0,4101.0,1717.0,4.1033,NEAR OCEAN
10050,-120.99,39.18,23.0,2550.0,457.0,1016.0,405.0,3.6607,INLAND
3690,-118.36,34.22,37.0,1512.0,348.0,1545.0,351.0,3.7663,<1H OCEAN
2943,-118.97,35.36,31.0,1418.0,306.0,1219.0,312.0,1.5743,INLAND


In [15]:
test_label = y.loc[test_data.index]
test_label
# test_label ga yuqoridagi test_data ning index lari va median_house_value yani narxi ham saqlab chiqarildi

Unnamed: 0,median_house_value
14698,177700.0
17459,216700.0
12538,103500.0
15121,161000.0
1432,161900.0
16759,417600.0
16814,301000.0
10050,153000.0
3690,160300.0
2943,46700.0


In [16]:
test_data_prepared = full_pipeline.transform(test_data)
predicted_labels = LR_model.predict(test_data_prepared)
predicted_labels
# test_data ni pipeline.transform dan o'tkazildi va test_data_prepared ga saqlandi
# Hamda ML qismida o'tkazilib predicted_labels ga saqlandi

array([212545.65550047, 180019.29217613, 157840.26460487, 249634.44295871,
       241627.91089361, 325178.92260552, 318889.45712275, 113985.41055753,
       201818.87149886,  65333.38745765])

## Bu qiymatlarni yuqoridagi asl median_house_value qiymatlari bilan solishtirishimiz mumkin

## Tushunarliroq ko'rish uchun bashorat qilganimiz va asl median_house_value ustuni yani uni test lebel ga saqlagan qiymatlarimizni aniqlik darajasini solishtirib ko'rishimiz mumkin

In [17]:
pd.DataFrame({'Bashorat_qilindi': predicted_labels, 'Asl_qiymat': test_label})

# Buni shunchaki tekshirib ko'rdik bir nechta qatorlar bilan
# test_set qisimlarni ML ga hali kiritmaganligimiz sababli yuqori aniqlikda natija olaolmadik

Unnamed: 0,Bashorat_qilindi,Asl_qiymat
14698,212545.6555,177700.0
17459,180019.292176,216700.0
12538,157840.264605,103500.0
15121,249634.442959,161000.0
1432,241627.910894,161900.0
16759,325178.922606,417600.0
16814,318889.457123,301000.0
10050,113985.410558,153000.0
3690,201818.871499,160300.0
2943,65333.387458,46700.0


# **NEW SECTION**

# **5**-**QADAM**.

# **Modelni baholash**

In [18]:
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
1945,-120.93,38.77,9.0,2229.0,355.0,788.0,341.0,5.5111,196300.0,INLAND
18006,-121.97,37.30,31.0,3340.0,735.0,1891.0,686.0,4.8542,275000.0,<1H OCEAN
13062,-121.32,38.57,15.0,3369.0,499.0,1733.0,470.0,5.3100,127500.0,INLAND
13396,-117.57,34.02,5.0,6933.0,1311.0,3845.0,1285.0,4.6727,158900.0,INLAND
9787,-120.79,36.06,29.0,1916.0,386.0,1019.0,314.0,2.4881,87500.0,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
15490,-117.19,33.14,12.0,3652.0,923.0,1677.0,728.0,2.3267,92000.0,<1H OCEAN
4339,-118.29,34.08,34.0,479.0,182.0,557.0,170.0,1.5250,210000.0,<1H OCEAN
14248,-117.10,32.72,5.0,1615.0,387.0,1094.0,394.0,2.2024,137200.0,NEAR OCEAN
9060,-118.14,34.68,25.0,1703.0,342.0,775.0,309.0,4.5455,126500.0,INLAND


In [19]:
X_test = test_set.drop('median_house_value', axis=1)
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
1945,-120.93,38.77,9.0,2229.0,355.0,788.0,341.0,5.5111,INLAND
18006,-121.97,37.30,31.0,3340.0,735.0,1891.0,686.0,4.8542,<1H OCEAN
13062,-121.32,38.57,15.0,3369.0,499.0,1733.0,470.0,5.3100,INLAND
13396,-117.57,34.02,5.0,6933.0,1311.0,3845.0,1285.0,4.6727,INLAND
9787,-120.79,36.06,29.0,1916.0,386.0,1019.0,314.0,2.4881,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
15490,-117.19,33.14,12.0,3652.0,923.0,1677.0,728.0,2.3267,<1H OCEAN
4339,-118.29,34.08,34.0,479.0,182.0,557.0,170.0,1.5250,<1H OCEAN
14248,-117.10,32.72,5.0,1615.0,387.0,1094.0,394.0,2.2024,NEAR OCEAN
9060,-118.14,34.68,25.0,1703.0,342.0,775.0,309.0,4.5455,INLAND


In [20]:
y_test = test_set['median_house_value'].copy()
y_test

Unnamed: 0,median_house_value
1945,196300.0
18006,275000.0
13062,127500.0
13396,158900.0
9787,87500.0
...,...
15490,92000.0
4339,210000.0
14248,137200.0
9060,126500.0


In [21]:
X_test_prepapred = full_pipeline.transform(X_test)

In [22]:
y_predicted = LR_model.predict(X_test_prepapred)

In [23]:
y_predicted

array([179014.44757971, 287723.51488917, 171954.17417619, ...,
       141884.44149597, 189010.11042589, 111732.33857138])

In [24]:
pd.DataFrame({'Bashorat_qilindi': y_test, 'Asl_qiymat': y_predicted})
# Barchasini ham tekshirib ko'rdik va solishtirib ko'ryabmiz

Unnamed: 0,Bashorat_qilindi,Asl_qiymat
1945,196300.0,179014.447580
18006,275000.0,287723.514889
13062,127500.0,171954.174176
13396,158900.0,177605.404386
9787,87500.0,179293.205189
...,...,...
15490,92000.0,168659.188960
4339,210000.0,143417.782092
14248,137200.0,141884.441496
9060,126500.0,189010.110426


## mean absolute error yani bu bilan o'rtacha qancha hato bo'layotganligini aniqlashimiz mumkin
## Avvalgi darslarda mean absolute error bilan tanishganmiz endi uni hisoblaymiz

In [25]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_test, y_predicted)

print(f"Mean absolute error:,({mean_absolute_error(y_test, y_predicted)}) $ yani 49_392 dollar ekan o'rtacha xato")

Mean absolute error:,(49392.902362154084) $ yani 49_392 dollar ekan o'rtacha xato


## Endi o'rtacha kvadrat xatoligini tekshirib ko'ramiz

In [26]:
# Hamda buni oxirida kvadrat yani ildizdan ham chiqarib yuborishimiz kerak
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_predicted)

print("RMSE=", np.sqrt(mse))

# Bunda o'rtacha kvadrat xatoligi 67_880 $ ni ko'rsatmoqda

RMSE= 67880.28851569007


Model aniqligini oshirish uchun yagona, universal yechim yo'q. Qilib ko'rishingiz mumkin bo'lgan ishlar:

Yaxhsiroq paramterlar topish
Yaxhsiroq model (algoritm) tanlash
Ko'proq ma'lumot yig'ish va hokazo.
Biz hozir boshqa model bilan sinab ko'ramiz.

## from sklearn.linear_model import LinearRegression

## LR_model = LinearRegression()

## Ko'rib turganimizdek LinearRegression dan yuqori darajada aniqlik chiqmadi shuning uchun pastda boshqa algoritmdan ham foydalanib ko'ramiz

## Shuningdek ML da boshqa algoritmlar ham ko'p hisoblanadi

# **NEW SECTION**

# **Modelni baholash.**

# **Random Forest.**

In [27]:
from sklearn.ensemble import RandomForestRegressor

RF_model = RandomForestRegressor()
RF_model.fit(X_prepared, y)

In [28]:
y_predicted = RF_model.predict(X_test_prepapred)

In [29]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_predicted)

print("RMSE=", np.sqrt(mse))

RMSE= 50583.029689329655


## Demak ko'rishimiz mumkin yuqorida LinearRegression da mean_squared_error bu 67_880 $ edi.

## Bunda RandomForestRegressor da  mean_squared_error ni hisoblaganimizda 50_281 $ xatolik chiqdi

## Yani nisbatan xatolar kamroq

## **DecisionTree**

In [59]:
from sklearn.tree import DecisionTreeRegressor
Tree_model = DecisionTreeRegressor()
Tree_model.fit(X_prepared, train_set["median_house_value"])

In [61]:
y_predicted = Tree_model.predict(X_test_prepapred)

In [62]:
lin_mse = mean_squared_error(y_test, y_predicted)
# RMSE hisoblaymiz
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

73564.30408465305


# **NEW SECTION**

# **Modelni baholash.**

# **Cross-validation**

In [36]:
x = df.drop('median_house_value', axis=1)
y = df['median_house_value'].copy()
# Bularni yana qayatdan x va y ga saqlab oldik

x_prepared = full_pipeline.fit_transform(x)

In [37]:
from sklearn.model_selection import cross_val_score

mse_scores = cross_val_score(LR_model, x_prepared, y, scoring='neg_mean_squared_error', cv=10)
mse_scores
# Ushbu kod cross_val_score funksiyasidan foydalanib, LinearRegression modelini (LR_model) x_prepared xususiyatlari
# va y maqsad qiymatlari yordamida baholaydi. Bu jarayon 10 ta turli bo'linmada amalga oshiriladi (cv=10), modelning
# har bir bo'linmada o'rtacha kvadratik xatoligini (neg_mean_squared_error) hisoblaydi. Natijada,
# modelning umumiy ishlashini baholash uchun bir qator qiymatlar qaytariladi.

array([-7.08688912e+09, -3.74440316e+09, -7.52445381e+09, -3.87963729e+09,
       -6.48624992e+09, -4.74977158e+09, -2.75676075e+09, -8.26357866e+09,
       -6.03341946e+09, -2.90962649e+09])

In [38]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std.dev:", scores.std())

# display_scores funksiyasi baholash natijalari (masalan, cross_val_score dan olingan
# mse_scores kabi ballar ro'yxati) berilganida, ularni tushunarli tarzda chiqarish uchun ishlatiladi.

# print("Scores:", scores): Berilgan barcha individual ballarni chop etadi.
# print("Mean:", scores.mean()): Barcha ballarning o'rtachasini hisoblab, chop etadi. Bu modelning o'rtacha ish faoliyatini ko'rsatadi.
# print("Std.dev:", scores.std()): Ballarning standart og'ishini hisoblab, chop etadi. Bu modelning
# baholash natijalarida qanchalik o'zgaruvchan ekanligini ko'rsatadi (ya'ni, natijalar bir-biriga qanchalik yaqin yoki uzoq).

In [39]:
display_scores(np.sqrt(-mse_scores))
# Ushbu kod display_scores(np.sqrt(-mse_scores)) cross_val_score orqali hisoblangan o'rtacha kvadratik xatolar
# (MSE) asosida modelning RMSE (Root Mean Squared Error) qiymatlarini baholaydi va natijalarni tushunarli formatda ko'rsatadi.

# Umuman olganda, bu kod sizning Linear Regression modelingizning har bir cross-validation
# bo'linmasidagi o'rtacha bashorat xatosini (RMSE) hisoblaydi va bu xatolarning umumiy statistikasini ko'rsatadi.

# Umuman olganda Scores da 10 marta qayta hisoblab mean std qiymatlarini chiqarib beradi o'rtacha 71_888 $ ekan xatolik

Scores: [84183.66301514 61191.52853899 86743.60959739 62286.73445075
 80537.25795828 68918.58661112 52504.86407192 90904.22793667
 77675.08903006 53940.95369716]
Mean: 71888.65149074615
Std.dev: 13247.671855830775


In [40]:
scores = cross_val_score(RF_model, x_prepared, y, scoring='neg_mean_squared_error', cv=10)
display_scores(np.sqrt(-scores))
display_scores(np.sqrt(-mse_scores))

# Bunda scoring='neg_mean_squared_error' bilan hisoblab o'rtacha 63_745 xatosini chiqardi

Scores: [97942.59343916 47822.9819047  65482.48351768 56669.72410989
 60917.58846346 59718.15348909 46315.90167093 78684.69241515
 74681.94253856 49925.8348329 ]
Mean: 63816.18963815117
Std.dev: 15289.792086085035
Scores: [84183.66301514 61191.52853899 86743.60959739 62286.73445075
 80537.25795828 68918.58661112 52504.86407192 90904.22793667
 77675.08903006 53940.95369716]
Mean: 71888.65149074615
Std.dev: 13247.671855830775


# **NEW SECTION**

# **6**-**QADAM**

# **TAQDIMOT**

# **Taqdimot.**
# **Modelni saqlab olish**

## pickle

In [45]:
import pickle

filname = 'LR_model.pkl'  # Faylga hohlagancha nom berish mumkin
with open(filname, 'wb') as file:
  pickle.dump(LR_model, file)

# Buni pickle usulida saqladik

In [46]:
with open(filname, 'rb') as file:
  LR_model_loaded = pickle.load(file)

# O'qilmoqda

In [47]:
LR_model_loaded

## joblib

In [50]:
import joblib

filename = 'LR_model.pkl' # Faylga boshqacha nom berish ham mumkin
joblib.dump(LR_model, filename)

# Bu

['LR_model.pkl']

In [52]:
model = joblib.load(filename)

In [54]:
scores = cross_val_score(model, x_prepared, y, scoring='neg_mean_squared_error', cv=10)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores: [84183.66301514 61191.52853899 86743.60959739 62286.73445075
 80537.25795828 68918.58661112 52504.86407192 90904.22793667
 77675.08903006 53940.95369716]
Mean: 71888.65149074615
Std.dev: 13247.671855830775


Bu kod joblib kutubxonasidan foydalanib mashina o'rganish modelini (LR_model) faylga saqlash va keyinchalik uni yuklash jarayonini amalga oshiradi:

import joblib: joblib kutubxonasini import qiladi. Bu kutubxona Python obyektlarini, ayniqsa NumPy massivlari kabi katta hajmdagi ma'lumotlarga ega bo'lgan ML modellarini samarali saqlash va yuklash uchun mo'ljallangan.
filename = 'LR_model.pkl': Saqlanadigan fayl nomi 'LR_model.pkl' qilib belgilanadi. '.pkl' kengaytmasi odatda 'pickle' formatidagi fayllar uchun ishlatiladi, joblib ham shundan foydalanadi.

joblib.dump(LR_model, filename): Bu qator LR_model nomli mashina o'rganish modelini belgilangan filename (ya'ni 'LR_model.pkl') fayliga saqlaydi. Bu modelni kelajakda qayta ishlatish uchun diskka yozish imkonini beradi, modelni qayta o'qitishga hojat qolmaydi.

model = joblib.load(filename): Bu qator esa avval saqlangan 'LR_model.pkl' faylidan modelni o'qiydi va uni model o'zgaruvchisiga yuklaydi. Endi model o'zgaruvchisidan foydalanib, yuklangan modelni bashorat qilish yoki boshqa vazifalar uchun ishlatish mumkin.


## Loyiha ishi tugadi