In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OrdinalEncoder,StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor


In [None]:
# Get acquainted with the data: load the training set and preview its first rows

df = pd.read_csv(filepath_or_buffer='../input/aviachipta-narx/train_data.csv')
df.head(n=10)

In [None]:
# Summary information about the column values
df.describe()

In [None]:
# Pairwise correlation between columns.
# Only numeric columns are passed to corr(): the frame also contains the categorical
# columns that are one-hot encoded later in this notebook (they appear to be text),
# and recent pandas versions raise on non-numeric columns instead of skipping them.
df.select_dtypes(include='number').corr()

In [None]:
# Drop the 'id' and 'flight' columns.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: running it again after the columns are gone does not raise KeyError.
df = df.drop(columns=['id', 'flight'], errors='ignore')


In [None]:
# Row counts for each value of the 'class' column
class_pie = df.loc[:, 'class'].value_counts()
class_pie

In [None]:
# Row counts for each value of the 'stops' column
stops_pie = df.loc[:, 'stops'].value_counts()
stops_pie

In [None]:
# Data Visualisation
# Share of rows per airline

airline_pie = df['airline'].value_counts()
# value_counts() sorts descending, so the first two entries are the two largest slices;
# those are pulled out of the pie. The list is sized from the data because plt.pie
# requires `explode` to have exactly one entry per slice.
myexplode = [0.07 if rank < 2 else 0 for rank in range(len(airline_pie))]

plt.figure(figsize=(10,6))
plt.pie(airline_pie, labels=airline_pie.index, explode=myexplode, autopct='%.1f')
plt.show()

In [None]:
# Relationships with the number of flights: one count plot per categorical column

fig, ax = plt.subplots(3, 2, figsize=(20, 18))

# (column, panel title, extra countplot keyword arguments), listed in row-major panel order
count_panels = [
    ('source_city', 'Chiquvchi shahar ', {}),
    ('destination_city', 'Manzil', {}),
    # departure_time is titled "departure time" (Jo'nash vaqti), not "landing time"
    ('departure_time', "Jo'nash vaqti", {}),
    ('arrival_time', 'Yetib kelish vaqti', {}),
    ('stops', "To'xtovlar", {}),
    # the class panel is additionally split by number of stops
    ('class', 'Klass', {'hue': 'stops'}),
]

# ax.flat walks the 3x2 grid row by row, matching the order of count_panels
for panel_ax, (column, title, extra_kwargs) in zip(ax.flat, count_panels):
    sns.countplot(ax=panel_ax, data=df, x=column, **extra_kwargs)
    panel_ax.set_title(title)

plt.show()

In [None]:
# Relationships with price: price against flight duration and against days left

fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))
fig.suptitle("Narx bilan bog'liqliklar")

duration_ax, days_left_ax = ax
sns.scatterplot(x='duration', y='price', data=df, ax=duration_ax)
sns.scatterplot(x="days_left", y='price', data=df, s=50, ax=days_left_ax)

plt.show()

Bundan ko'rinadiki, parvoz davomiyligi va ungacha bo'lgan vaqt narxga ko'p ta'sir qilmaydi.

In [None]:
# Machine Learning

# Use StratifiedShuffleSplit to get train/test sets balanced on the 'class' column

from sklearn.model_selection import StratifiedShuffleSplit
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays
train_index, test_index = next(stratified_split.split(df, df['class']))

# split() returns positional indices, so rows are selected with .iloc; .loc would only
# be equivalent while df happens to carry a default 0..n-1 index
strat_train_set = df.iloc[train_index]
strat_test_set = df.iloc[test_index]

In [None]:
# Separate the training split into target (price) and features
y_train = strat_train_set['price'].copy()
X_train = strat_train_set.drop(columns='price')
# Numeric feature columns; their names are reused when the column transformer is built
X_num = strat_train_set[['duration', 'days_left']]

In [None]:
# Build the pipeline for the numeric columns: a single standard-scaling step

num_pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])

In [None]:
# Assemble the overall preprocessing pipeline

num_attribs = list(X_num)
cat_attribs = ['airline','source_city', 'departure_time', 'stops',
       'arrival_time', 'destination_city', 'class']

full_pipeline = ColumnTransformer([
    # numeric columns are standard-scaled
    ('num', num_pipeline, num_attribs),
    # categorical columns are one-hot encoded; handle_unknown='ignore' encodes a category
    # that was not present when the pipeline was fitted as all zeros instead of raising
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs)
])

In [None]:
# Pass the data through the pipeline

# Hold-out split: target and features
y_test = strat_test_set['price'].copy()
X_test = strat_test_set.drop(columns='price')

# The pipeline is fitted on the training features only
X_prepared = full_pipeline.fit_transform(X_train)

In [None]:
# Linear Regression model

# Fit on the prepared training features
LR_model = LinearRegression().fit(X_prepared, y_train)

# Predict on the hold-out split, transformed by the already fitted pipeline
X_test_prepared = full_pipeline.transform(X_test)
y_predicted = LR_model.predict(X_test_prepared)

# Error metrics, reported rounded to whole units
mae = mean_absolute_error(y_test, y_predicted)
mse = mean_squared_error(y_test, y_predicted)
rmse = np.sqrt(mse)

print("MAE=", np.around(mae))
print("RMSE=", np.around(rmse))

In [None]:
# RandomForestRegressor model

# Fixed seed (the same value the train/test split uses) so the reported errors and the
# submission built from this model are reproducible between runs
RF_model = RandomForestRegressor(random_state=42)

RF_model.fit(X_prepared, y_train)

# Predict on the hold-out split, transformed by the already fitted pipeline
X_test_prepared = full_pipeline.transform(X_test)
y_predicted_rf = RF_model.predict(X_test_prepared)

mae = mean_absolute_error(y_test, y_predicted_rf)
mse = mean_squared_error(y_test, y_predicted_rf)

print("MAE=", np.around(mae))
print("RMSE=", np.around(np.sqrt(mse)))


In [None]:
# Side-by-side table of actual prices and random forest predictions for the hold-out split
predicted = y_test.to_frame(name='Asl narxi').assign(**{'Bashorat qilingan narx': y_predicted_rf})
predicted.head(n=10)

In [None]:
# DecisionTreeRegressor model

# Fixed seed (the same value the train/test split uses) so the fitted tree and its
# reported errors are reproducible between runs
tree_model = DecisionTreeRegressor(random_state=42)

tree_model.fit(X_prepared, y_train)

# Predict on the hold-out split, transformed by the already fitted pipeline
X_test_prepared = full_pipeline.transform(X_test)
y_predicted = tree_model.predict(X_test_prepared)

mae = mean_absolute_error(y_test, y_predicted)
mse = mean_squared_error(y_test, y_predicted)

print("MAE=", np.around(mae))
print("RMSE=", np.around(np.sqrt(mse)))


In [None]:
# KNeighborsRegressor modeli

knn = KNeighborsRegressor(n_neighbors = 6)

knn.fit(X_prepared, y_train)

X_test_prepared = full_pipeline.transform(X_test)
y_predicted = knn.predict(X_test_prepared)

mae = mean_absolute_error(y_test, y_predicted)
mse = mean_squared_error(y_test, y_predicted)

print("MAE=", np.around(mae))
print("RMSE=", np.around(np.sqrt(mse)))


In [None]:
# # Finding the best k
# NOTE: the grid search below is commented out, so this cell does not execute anything.

# from sklearn.model_selection import GridSearchCV
# param_grid = {'n_neighbors': np.arange(1, 25)}

# knn_gscv = GridSearchCV(knn, param_grid, cv=5)

# knn_gscv.fit(X_prepared, y_train)


In [None]:
# Best parameters found by the grid search (commented out, like the search itself)
# knn_gscv.best_params_

In [None]:
# XGBoost model

XGB_model = XGBRegressor()

XGB_model.fit(X_prepared, y_train)

# Transform the hold-out features in this cell, as the other model cells do, so the
# cell does not depend on X_test_prepared left over from an earlier cell
X_test_prepared = full_pipeline.transform(X_test)
y_predicted = XGB_model.predict(X_test_prepared)

mae = mean_absolute_error(y_test, y_predicted)
mse = mean_squared_error(y_test, y_predicted)

print("MAE=", np.around(mae))
print("RMSE=", np.around(np.sqrt(mse)))

Bundan ko'rinib turibdiki, Random Forest modeli eng kam xatolik bilan ishlayapti. Shuning uchun ushbu modelni tanlaymiz.

In [None]:
# Submission: load the test set and preview its first rows

test_df = pd.read_csv(filepath_or_buffer='../input/aviachipta-narx/test_data.csv')
test_df.head(n=10)

In [None]:
# Drop the 'id' and 'flight' columns, which the preprocessing pipeline does not use.
# Rebinding with errors='ignore' keeps the cell idempotent: running it again after the
# columns are gone does not raise KeyError.
test_df = test_df.drop(columns=['id', 'flight'], errors='ignore')

In [None]:
# Run test_df through the preprocessing pipeline (transform only; the pipeline is not refitted here)
test_df_prapared = full_pipeline.transform(test_df)

In [None]:
# Predict prices for the prepared test rows with the random forest model
test_predicted = RF_model.predict(test_df_prapared)
test_predicted

In [None]:
# Load the sample solution file, which is used as the template for the submission
sample_solution = pd.read_csv(filepath_or_buffer="../input/aviachipta-narx/sample_solution.csv")
sample_solution.head(n=5)

In [None]:
# Write the predictions into the 'price' column of the template.
# NOTE(review): the values are assigned by position, so this assumes sample_solution.csv
# lists its rows in the same order as test_data.csv — TODO confirm.
sample_solution['price'] = test_predicted
sample_solution.head(10)

In [None]:
# yakuniy jadvalni submission.csv sifatida saqlaymiz
sample_solution.to_csv('submission.csv')

In [None]:
submission_df = pd.read_csv('submission.csv')
submission_df.drop('Unnamed: 0',axis=1, inplace=True)

In [None]:
# Display the submission table as read back from submission.csv
submission_df