## Aviachipta narxini bashorat qilish

### Ushbu ishda Hindistondagi parvozlar haqidagi ma'lumotlar taqdim etilgan. Quyida bir nechta regressiya algoritmlaridan foydalangan holda model (hozircha) optimal ko'rinishga keltirildi.



> ##### Kerakli kutubxonalarni chaqirib olamiz



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor



> ##### Train hamda Yakuniy Test setlarni o'zgaruvchilarga saqlab olamiz  



In [3]:
# Load the labelled training set; the first CSV column is used as the row index.
raw_train = pd.read_csv(filepath_or_buffer="train_data.csv", index_col=0)
raw_train.head(5)

Unnamed: 0_level_0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Vistara,UK-810,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21,7212
2,SpiceJet,SG-5094,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7,5292
3,Vistara,UK-846,Bangalore,Morning,one,Evening,Delhi,Business,9.58,5,60553
4,Vistara,UK-706,Kolkata,Morning,one,Evening,Hyderabad,Economy,6.75,28,5760
5,Indigo,6E-5394,Chennai,Early_Morning,zero,Morning,Mumbai,Economy,2.0,4,10712


In [4]:
# Load the final test set (no 'price' column); the first CSV column is the row index.
raw_test = pd.read_csv(filepath_or_buffer="test_data.csv", index_col=0)
raw_test.head(5)

Unnamed: 0_level_0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Air_India,AI-765,Kolkata,Evening,one,Night,Delhi,Business,28.25,2
2,Vistara,UK-747,Delhi,Early_Morning,one,Night,Mumbai,Business,13.83,34
3,Air_India,AI-570,Mumbai,Early_Morning,zero,Early_Morning,Chennai,Business,2.0,30
4,AirAsia,I5-974,Hyderabad,Night,one,Late_Night,Delhi,Economy,5.17,26
5,Air_India,AI-770,Kolkata,Night,one,Afternoon,Mumbai,Economy,16.33,35




> ##### Labelni ajratib olamiz



In [5]:
# Separate the target column ('price') from the feature columns.
y_train = raw_train['price']
X_train = raw_train.drop(columns=['price'])



> Ba'zi ustunlarning qiymatlarini map() orqali numeric ko'rinishga keltiramiz



In [6]:
# Integer encodings for the two ordered categorical columns.
STOPS_TO_INT = {'zero': 0, 'one': 1, 'two_or_more': 2}
CLASS_TO_INT = {'Economy': 0, 'Business': 1}


def encode_ordinal_columns(frame):
    """Return a copy of ``frame`` with 'stops' and 'class' mapped to integers.

    Columns that are already numeric are left untouched, so re-running this
    cell does not turn previously encoded values into NaN (``Series.map``
    yields NaN for every value that is missing from the mapping).
    """
    encoded = frame.copy()
    for column, mapping in (('stops', STOPS_TO_INT), ('class', CLASS_TO_INT)):
        if not pd.api.types.is_numeric_dtype(encoded[column]):
            encoded[column] = encoded[column].map(mapping)
    return encoded


# Same encoding for train and test so both share one feature representation.
X_train = encode_ordinal_columns(X_train)
raw_test = encode_ordinal_columns(raw_test)



> Qolgan ustunlardagi unikal kategoriyalarni pandasdagi get_dummies() funksiyasi orqali alohida ustunlarga ajratib olamiz



In [7]:
# One-hot encode every remaining non-numeric training column.
X_train_dummies = pd.get_dummies(data=X_train)
X_train_dummies.head(5)

Unnamed: 0_level_0,stops,class,duration,days_left,airline_AirAsia,airline_Air_India,airline_GO_FIRST,airline_Indigo,airline_SpiceJet,airline_Vistara,...,arrival_time_Evening,arrival_time_Late_Night,arrival_time_Morning,arrival_time_Night,destination_city_Bangalore,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,14.25,21,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
2,0,0,1.75,7,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
3,1,1,9.58,5,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
4,1,0,6.75,28,0,0,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
5,0,0,2.0,4,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1


In [8]:
# One-hot encode every remaining non-numeric test column.
raw_test_dummies = pd.get_dummies(data=raw_test)
raw_test_dummies.head(5)

Unnamed: 0_level_0,stops,class,duration,days_left,airline_AirAsia,airline_Air_India,airline_GO_FIRST,airline_Indigo,airline_SpiceJet,airline_Vistara,...,arrival_time_Evening,arrival_time_Late_Night,arrival_time_Morning,arrival_time_Night,destination_city_Bangalore,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,1,28.25,2,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
2,1,1,13.83,34,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
3,0,1,2.0,30,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,5.17,26,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
5,1,0,16.33,35,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1




> ##### Train datadan 'flight' ustunini tashlab yubormadim, chunki ushbu ustun qiymatlarining yaratiladigan modelga yaxshigina ta'siri bor. Bu o'z navbatida test setda ushbu ustundagi barcha qiymatlar uchramasligi muammosini keltirib chiqardi, shuning uchun quyidagi kod orqali Train setga o'xshagan Test setni hosil qildim



In [10]:
# Align the one-hot encoded test features with the training feature space:
#   * columns are put in the training order,
#   * dummy columns that exist only in the training set are added and filled with 0,
#   * dummy columns that exist only in the test set are dropped.
# reindex keeps the test set's own row index, so no row count is hard-coded.
test_new = raw_test_dummies.reindex(columns=X_train_dummies.columns, fill_value=0)

# The model needs exactly the same features, in the same order, at predict time.
assert list(test_new.columns) == list(X_train_dummies.columns)

X_train_dummies.shape, test_new.shape

((20000, 1344), (5000, 1344))



> Standartlashni amalga oshirdim



In [11]:
# Learn the per-column mean and variance from the training features only,
# then apply that same transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train_dummies)
X_train_scaled = scaler.transform(X_train_dummies)
test_scaled = scaler.transform(test_new)



> ##### Turli algoritmlarda sinab ko'ramiz



In [12]:
# Decision Tree

# model_tree = DecisionTreeRegressor()
# cross_tree = cross_val_predict(model_tree, X = X_train_scaled, y=y_train, cv=5)
# 
# MAE : 1777.10
# RMSE : 4378.54  (sqrt of MSE; an MSE could not be below MAE**2)

In [13]:
# SVM
# 
# model_svm = SVR()
# cross_svm = cross_val_predict(model_svm, X= X_train_scaled, y=y_train, cv=5)
# 
# MAE : 16054.05
# RMSE : 26222.01  (sqrt of MSE; an MSE could not be below MAE**2)

In [14]:
# xgboost
# 
# model_xgb = XGBRegressor()
# cross_xgb = cross_val_predict(model_xgb, X= X_train_scaled, y=y_train, cv=5)
# 
# MAE : 2901.69
# RMSE : 4854.99  (sqrt of MSE; an MSE could not be below MAE**2)



> ##### Bir nechta model hamda Deep Learningning ba'zi regressiya modellaridan ham foydalanish natijasida hozirgacha eng yaxshi natijani RandomForestRegressor algoritmi ko'rsatdi


In [15]:
# Random forest evaluated with 5-fold out-of-fold predictions on the scaled
# training features. random_state pins the forest's internal randomness so the
# reported scores can be reproduced on a fresh run.
model_random = RandomForestRegressor(n_estimators=100, random_state=42)
# NOTE: despite its name, cross_tree holds the random-forest predictions.
cross_tree = cross_val_predict(model_random, X=X_train_scaled, y=y_train, cv=5)

# The second metric is the square root of the MSE, so it is labelled RMSE.
print(f'MAE : {mean_absolute_error(y_train, cross_tree):.2f}')
print(f'RMSE : {mean_squared_error(y_train, cross_tree) ** 0.5:.2f}')

MAE : 1621.64
MSE : 3443.11


In [26]:
# Final model: fit on the full scaled training set and predict the test set.
# random_state is fixed so the generated submission is reproducible.
model_random_last = RandomForestRegressor(n_estimators=100, random_state=42)
model_random_last.fit(X_train_scaled, y_train)

pred_random_2 = model_random_last.predict(test_scaled)

In [27]:
# Submission template. Read with a relative path, like the train/test CSVs,
# so the notebook does not depend on the Colab-specific '/content' directory.
sample_solution = pd.read_csv('sample_solution.csv', index_col=0)
sample_solution.head()

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
1,0
2,0
3,0
4,0
5,0


In [28]:
# Overwrite the placeholder prices in the template with the model predictions.
sample_solution['price'] = pred_random_2
sample_solution.head(5)

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
1,53832.2
2,63033.34
3,23393.82
4,2386.06
5,5545.51


In [29]:
# Write the filled-in submission (index column included) to the working directory.
sample_solution.to_csv(path_or_buf='sample_solution_for_admit.csv')

In [None]:
# Mount Google Drive at /content/drive; requires the google.colab package.
# NOTE(review): this cell has no execution count, no visible cell reads from the
# mounted drive, and the import sits outside the top import cell. Confirm
# whether the cell is still needed.
from google.colab import drive
drive.mount('/content/drive')