In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the training data; the first CSV column becomes the DataFrame index.
# NOTE(review): absolute '/content/...' path — looks like a Colab location; confirm before running elsewhere.
train_data = pd.read_csv('/content/train_data.csv', index_col=0)
# Preview the first rows.
train_data.head()

In [None]:
# (number of rows, number of columns) of the training frame.
train_data.shape

In [None]:
# Column dtypes, non-null counts and memory usage.
train_data.info()

In [None]:
# Count of missing values in each column.
train_data.isnull().sum()

In [None]:
# Show rows that repeat an earlier row exactly; an empty frame means no duplicates.
train_data[train_data.duplicated()]

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the columns.
train_data.describe()

In [None]:
# Pairwise correlation matrix of the numeric columns only.
# Text columns are excluded explicitly: pandas >= 2.0 no longer drops them
# silently and raises when asked to correlate non-numeric data.
train_data.select_dtypes(include='number').corr()

In [None]:
# Correlation of every numeric column with the target `price`.
# Restricting to numeric columns avoids the error pandas >= 2.0 raises when
# text columns are included in a correlation.
train_data.select_dtypes(include='number').corrwith(train_data['price'])

In [None]:
# List the column names of the training frame.
train_data.columns

In [None]:
# Number of rows for each airline.
train_data['airline'].value_counts()

In [None]:
# Number of rows for each value of the `class` column.
train_data['class'].value_counts()

In [None]:
# Number of rows for each value of the `stops` column.
train_data['stops'].value_counts()

In [None]:
# Bar chart: number of rows (flights) recorded for each airline.
airline = train_data['airline'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
airline.plot.bar(ax=ax, color='green')
ax.set_xlabel("Aviakompaniya")
ax.set_ylabel("Amalga oshirilgan parvozlar")
plt.show()

In [None]:
# Bar chart: mean ticket price per airline, most expensive airline first.
fig, ax = plt.subplots(figsize=(10, 6))
avg_price = (
    train_data.groupby('airline')['price']
    .mean()
    .reset_index()
    .sort_values(by='price', ascending=False)
)
sns.barplot(data=avg_price, x='airline', y='price', ax=ax)
ax.set_xlabel('Aviakompaniya')
ax.set_ylabel("O'rtacha narx")
plt.show()

In [None]:
# Scatter plot: flight duration (x) against ticket price (y).
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(train_data['duration'], train_data['price'], color='red')
ax.set_title("Parvoz davomiyligining chipta narxi bog'liqligi")
ax.set_xlabel("Parvoz davomiyligi")
ax.set_ylabel("Chipta narxi")
plt.show()

In [None]:
# Pie chart: share of rows belonging to each travel class.
fig, ax = plt.subplots(figsize=(8, 8))
train_data['class'].value_counts().plot.pie(ax=ax)
ax.set_title("Klasslar bo'yicha")
plt.show()

In [None]:
# Bar chart: mean ticket price for each travel class.
class_prices = train_data.groupby('class')['price'].mean()
sns.set_style("whitegrid")
class_prices.plot(kind='bar', color=['#4C72B0', '#55A868'])
plt.title("Average Ticket Price by Airplane Class")
plt.xlabel("Class")
# Axis label fixed: it previously carried a stray closing parenthesis ("Price)").
plt.ylabel("Price")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# (The earlier X_train / y assignments built from the full data were removed:
# they were overwritten below before ever being used.)
train_set, test_set = train_test_split(train_data, test_size=0.2, random_state=42)

# Features and target of the training split only.
X_train = train_set.drop("price", axis=1)
y = train_set["price"].copy()

In [None]:
# Preview the training features.
X_train.head()

In [None]:
# Preview the training target (price).
y.head()

In [None]:
# Split the feature columns by dtype so each group gets its own preprocessing.
# Every numeric dtype counts as a number column and everything else as a
# category column, so no column is left out of both lists: the
# ColumnTransformer built from these lists drops any column it is not given.
number_columns = X_train.select_dtypes(include='number').columns.tolist()
category_columns = X_train.select_dtypes(exclude='number').columns.tolist()

In [None]:
# Show which columns will be treated as categorical.
category_columns

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler


# Numeric features: fill missing values with the column mean, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('std_scaler', StandardScaler()),
])

# Categorical features: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
cat_pipeline = Pipeline(steps=[
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
from sklearn.compose import ColumnTransformer

# Route each group of columns to its own preprocessing pipeline and
# concatenate the results into a single feature matrix.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, number_columns),
    ('cat', cat_pipeline, category_columns),
])

In [None]:
# Fit the preprocessing on the training features and transform them in one step.
X_prepared = full_pipeline.fit_transform(X_train)

In [None]:
# ColumnTransformer returns a SciPy sparse matrix only when the combined output
# is sparse enough (see its `sparse_threshold`); otherwise it is already a
# dense ndarray, which has no `.toarray()`. Handle both so the cell never fails.
print(X_prepared.toarray() if hasattr(X_prepared, "toarray") else X_prepared)

In [None]:
from sklearn.linear_model import LinearRegression

# First model: linear regression with default settings.
LR_model = LinearRegression()

In [None]:
# Train the linear model on the preprocessed training features.
LR_model.fit(X_prepared, y)

In [None]:
# Features of the held-out split (target column removed).
X_test = test_set.drop('price', axis=1)

In [None]:
# True prices of the held-out split, used to score the models.
y_test = test_set['price'].copy()

In [None]:
# Apply the preprocessing already fitted on the training split (transform only, no refit).
X_test_prepared = full_pipeline.transform(X_test)

In [None]:
# Linear-regression predictions for the held-out split.
predicted_data = LR_model.predict(X_test_prepared)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Mean squared error of the current predictions against the held-out prices.
lin_mse = mean_squared_error(y_test, predicted_data)
# Compute the RMSE (square root of the MSE).
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

In [None]:
# Mean absolute error of the current predictions against the held-out prices.
lin_mae = mean_absolute_error(y_test, predicted_data)
print(lin_mae)

In [None]:
# DecisionTree
from sklearn.tree import DecisionTreeRegressor
# Fixed seed so the fitted tree, and the metrics reported from it, are the same
# on every run; 42 matches the seed used for the train/test split.
Tree_model = DecisionTreeRegressor(random_state=42)
Tree_model.fit(X_prepared, y)

In [None]:
# Decision-tree predictions for the held-out split (replaces the previous `predicted_data`).
predicted_data = Tree_model.predict(X_test_prepared)

In [None]:
# RMSE of whatever `predicted_data` currently holds against the held-out prices.
# NOTE: the `lin_` names are reused, so any earlier values stored in them are overwritten.
lin_mse = mean_squared_error(y_test, predicted_data)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

In [None]:
# MAE of whatever `predicted_data` currently holds (the `lin_mae` name is reused).
lin_mae = mean_absolute_error(y_test, predicted_data)
print(lin_mae)

In [None]:
# RandomForest
from sklearn.ensemble import RandomForestRegressor
# Fixed seed so the forest, its metrics and the final predictions are the same
# on every run; 42 matches the seed used for the train/test split.
RF_model = RandomForestRegressor(random_state=42)
RF_model.fit(X_prepared, y)

In [None]:
# Random-forest predictions for the held-out split (replaces the previous `predicted_data`).
predicted_data = RF_model.predict(X_test_prepared)

In [None]:
# RMSE of whatever `predicted_data` currently holds against the held-out prices.
# NOTE: the `lin_` names are reused, so any earlier values stored in them are overwritten.
lin_mse = mean_squared_error(y_test, predicted_data)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

In [None]:
# MAE of whatever `predicted_data` currently holds (the `lin_mae` name is reused).
lin_mae = mean_absolute_error(y_test, predicted_data)
print(lin_mae)

**Natijalar quyidagicha:**

*MODELLARIMIZ NATIJALARI*

LinearRegression -- RMSE = 6334.65, MAE = 4376.57

DecisionTree -- RMSE = 4407.16, MAE = 1767.81

RandomForest -- RMSE = 3379.6, MAE = 1581.95

Eng yaxshi natija RandomForest da kuzatildi, shuning uchun biz ushbu modelda bashorat qilamiz.

In [None]:
# Evaluation: load the test file that is transformed and scored further below;
# the first CSV column becomes the DataFrame index.
# NOTE(review): absolute '/content/...' path — looks like a Colab location; confirm before running elsewhere.
test_data_res = pd.read_csv('/content/test_data.csv', index_col=0)
test_data_res.head()

In [None]:
# (number of rows, number of columns) of the test frame.
test_data_res.shape

In [None]:
# Column dtypes, non-null counts and memory usage of the test frame.
test_data_res.info()

In [None]:
# Apply the preprocessing fitted on the training split to the test file (transform only).
# NOTE(review): the frame is passed as-is, so it presumably has no `price` column — confirm.
test_data_prepared = full_pipeline.transform(test_data_res)

In [None]:
# ColumnTransformer returns a SciPy sparse matrix only when the combined output
# is sparse enough (see its `sparse_threshold`); otherwise it is already a
# dense ndarray, which has no `.toarray()`. Handle both so the cell never fails.
print(test_data_prepared.toarray() if hasattr(test_data_prepared, "toarray") else test_data_prepared)

In [None]:
# (rows, features) of the preprocessed test matrix.
test_data_prepared.shape

In [None]:
# Final price predictions for the test file, made with the random-forest model.
finish_predicted_data = RF_model.predict(test_data_prepared)

In [None]:
# Wrap the predictions in a one-column frame named 'Price' and preview it.
df = pd.DataFrame(finish_predicted_data, columns=['Price'])
df.head()

In [None]:
# Save the predictions to the current working directory, without the row index.
df.to_csv('sample_solution.csv', index=False)

In [None]:
# Read the submission file back as a sanity check. It uses the same relative
# path that `df.to_csv('sample_solution.csv', ...)` writes to; the previous
# absolute '/content/...' path only matched when the working directory was /content.
dft = pd.read_csv("sample_solution.csv")
dft.head()