In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Load the labelled training data; index_col=0 uses the first CSV column as the row index.
train_data = pd.read_csv('/content/train_data.csv', index_col=0)
# Preview the first rows to check that the file parsed as expected.
train_data.head()

In [None]:
# Load the test data the same way as the training data (first CSV column becomes the index).
test_data = pd.read_csv('/content/test_data.csv', index_col=0)
# Preview the first rows.
test_data.head()

In [None]:
# (rows, columns) of the training and test frames, shown together.
train_data.shape, test_data.shape

In [None]:
# Print each column's dtype and non-null count, plus overall memory usage.
train_data.info()

In [None]:
# Number of missing values in each column.
train_data.isnull().sum()

In [None]:
# Rows that repeat an earlier row in every column; an empty frame means no duplicates.
train_data[train_data.duplicated()]

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train_data.describe()

In [None]:
# Pairwise correlations between the numeric columns.
# Non-numeric columns are filtered out explicitly: pandas >= 2.0 no longer drops
# them silently and raises when asked to correlate text data.
train_data.select_dtypes(include='number').corr()

In [None]:
# Correlation of every numeric column with the target column `price`.
# Non-numeric columns are filtered out explicitly: pandas >= 2.0 raises on text
# columns instead of skipping them.
train_data.select_dtypes(include='number').corrwith(train_data['price'])

In [None]:
# List the column names available in the training frame.
train_data.columns

In [None]:
# Number of rows for each distinct value of `airline`.
train_data['airline'].value_counts()

In [None]:
# Number of rows for each distinct value of `class`.
train_data['class'].value_counts()

In [None]:
# Number of rows for each distinct value of `stops`.
train_data['stops'].value_counts()

In [None]:
# Bar chart of price by airline; the rows are sorted by descending price
# before being handed to seaborn.
rows_by_price = train_data.sort_values('price', ascending=False)
sns.barplot(data=rows_by_price, x='airline', y='price')
plt.show()

In [None]:
# Bar chart of price by destination city; the rows are sorted by descending
# price before being handed to seaborn.
rows_by_price = train_data.sort_values('price', ascending=False)
sns.barplot(data=rows_by_price, x='destination_city', y='price')
plt.show()

In [None]:
# Scatter plot of flight duration (x) against price (y).
sns.scatterplot(data=train_data, x='duration', y='price')
plt.show()

In [None]:
# Box plot of the price distribution per travel class on an 8x5 inch figure.
f, ax = plt.subplots(figsize=(8, 5))
# Draw on the axes created above; seaborn returns the axes it drew on.
ax = sns.boxplot(data=train_data, x='class', y='price', ax=ax)
plt.show()

In [None]:
# Bar chart of price by travel class; the rows are sorted by descending price
# before being handed to seaborn.
rows_by_price = train_data.sort_values('price', ascending=False)
sns.barplot(data=rows_by_price, x='class', y='price')
plt.show()

In [None]:
# Hold out 20% of the labelled rows for validation. `train_test_split` is
# already imported in the notebook's import cell, so it is not re-imported here.
# random_state fixes the shuffle so the split is identical on every run.
train_set, test_set = train_test_split(train_data, test_size=0.2, random_state=42)

# Separate the features from the target (`price`) for the training part ...
X_train = train_set.drop("price", axis=1)
y_train = train_set["price"].copy()

# ... and for the hold-out part.
X_test = test_set.drop("price", axis=1)
y_test = test_set["price"].copy()

In [None]:
# Feature groups consumed by the preprocessing step.
# Columns treated as categorical.
category_columns = [
    'airline',
    'flight',
    'source_city',
    'departure_time',
    'stops',
    'arrival_time',
    'destination_city',
    'class',
]

# Columns treated as numeric.
number_columns = [
    'duration',
    'days_left',
]

In [None]:
# Numeric features: fill missing values with the column mean, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical features: one-hot encode; handle_unknown='ignore' keeps transform
# from raising on categories that were not seen during fit.
categorical_steps = [
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
]
cat_pipeline = Pipeline(steps=categorical_steps)

In [None]:
# Route each column group through its own preprocessing pipeline and
# concatenate the results into a single feature matrix.
column_routes = [
    ('num', num_pipeline, number_columns),
    ('cat', cat_pipeline, category_columns),
]
full_pipeline = ColumnTransformer(transformers=column_routes)

In [None]:
# Fit the preprocessing on the training features and transform them in one step.
# Despite its name, X_pred holds the transformed training features, not predictions.
X_pred = full_pipeline.fit_transform(X_train)

In [None]:
# Inspect the encoded training matrix without densifying all of it: one-hot
# encoding makes the matrix wide, so converting every row to a dense array can
# use a very large amount of memory.
print(X_pred.shape)
train_preview = X_pred[:5]
# The transformer may hand back a sparse matrix or a plain ndarray; only the
# former has (and needs) .toarray().
print(train_preview.toarray() if hasattr(train_preview, 'toarray') else train_preview)

In [None]:
# Apply the already-fitted preprocessing to the hold-out features (transform only, no refit).
# Despite its name, X_test_pred holds transformed features, not predictions.
X_test_pred = full_pipeline.transform(X_test)

In [None]:
# LinearRegression
# fit() returns the estimator itself, so construction and training are chained.
LR_model = LinearRegression().fit(X_pred, y_train)

# Predictions for the hold-out split.
lr_pred_data = LR_model.predict(X_test_pred)

In [None]:
# DecisionTree
# random_state pins the tree's internal randomness so the hold-out score is
# reproducible across runs.
Tree_model = DecisionTreeRegressor(random_state=42)
Tree_model.fit(X_pred, y_train)

# Predictions for the hold-out split.
tm_pred_data = Tree_model.predict(X_test_pred)

In [None]:
# RandomForest
# random_state pins the forest's randomness. This model also produces the final
# submission further down, so an unseeded forest would give a different
# submission file on every run.
RF_model = RandomForestRegressor(n_estimators=200, random_state=42)
RF_model.fit(X_pred, y_train)

# Predictions for the hold-out split.
rf_pred_data = RF_model.predict(X_test_pred)

In [None]:
# KNeighborsRegressor Model
# Each prediction uses the 200 nearest training rows; fit() returns the
# estimator itself, so construction and training are chained.
KNN_model = KNeighborsRegressor(n_neighbors=200).fit(X_pred, y_train)

# Predictions for the hold-out split.
knn_pred_data = KNN_model.predict(X_test_pred)

In [None]:
# LGBMRegressor Model
# 200 boosting rounds; fit() returns the estimator itself, so construction and
# training are chained.
LGB_model = LGBMRegressor(n_estimators=200).fit(X_pred, y_train)

# Predictions for the hold-out split.
lgb_pred_data = LGB_model.predict(X_test_pred)

In [None]:
# LinearSVR Model
# random_state pins the solver's internal shuffling so the fitted coefficients
# and the hold-out score are reproducible across runs.
SVR_model = LinearSVR(random_state=42)
SVR_model.fit(X_pred, y_train)

# Predictions for the hold-out split.
svr_pred_data = SVR_model.predict(X_test_pred)

In [None]:
# XGBRegressor Model
# Hyper-parameters are collected in one place so they are easy to read and tune.
xgb_params = {
    'n_jobs': 5,
    'learning_rate': 0.1,
    'max_depth': 10,
    'random_state': 1,
}
XGB_model = XGBRegressor(**xgb_params)
XGB_model.fit(X_pred, y_train)

# Predictions for the hold-out split.
xgb_pred_data = XGB_model.predict(X_test_pred)

In [None]:
# ExtraTreesRegressor Model
# random_state pins the randomised split selection so the hold-out score is
# reproducible across runs.
ET_model = ExtraTreesRegressor(random_state=42)
ET_model.fit(X_pred, y_train)

# Predictions for the hold-out split.
et_pred_data = ET_model.predict(X_test_pred)

In [None]:
# BaggingRegressor Model
# random_state pins the bootstrap sampling so the hold-out score is
# reproducible across runs.
BR_model = BaggingRegressor(random_state=42)
BR_model.fit(X_pred, y_train)

# Predictions for the hold-out split.
br_pred_data = BR_model.predict(X_test_pred)

In [None]:
# GradientBoostingRegressor Model
# (The heading previously said "BaggingRegressor", which is a different model.)
# random_state pins the estimator's internal randomness so the hold-out score
# is reproducible across runs.
GBR_model = GradientBoostingRegressor(random_state=42)
GBR_model.fit(X_pred, y_train)

# Predictions for the hold-out split.
gbr_pred_data = GBR_model.predict(X_test_pred)

In [None]:
# Evaluation: RMSE of every model on the hold-out split.
def report_rmse(label, predictions):
    """Print and return the hold-out error of one model.

    Parameters
    ----------
    label : str
        Text printed in front of the RMSE value.
    predictions : array-like
        Model predictions for the hold-out rows, compared against the
        notebook-level ``y_test``.

    Returns
    -------
    tuple
        ``(mse, rmse)`` where ``rmse`` is the square root of ``mse``.
    """
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    print(label, rmse)
    return mse, rmse


# One line per model; the per-model `*_mse` / `*_rmse` names are kept so any
# later cell that reads them continues to work.
lr_mse, lr_rmse = report_rmse("LinearRegression Model RMSE=", lr_pred_data)
tm_mse, tm_rmse = report_rmse("DecisionTreeRegressor Model RMSE=", tm_pred_data)
rf_mse, rf_rmse = report_rmse("RandomForestRegressor Model RMSE=", rf_pred_data)
knn_mse, knn_rmse = report_rmse("KNeighborsRegressor Model RMSE=", knn_pred_data)
lgb_mse, lgb_rmse = report_rmse("LGBMRegressor Model RMSE=", lgb_pred_data)
svr_mse, svr_rmse = report_rmse("LinearSVR Model RMSE=", svr_pred_data)
xgb_mse, xgb_rmse = report_rmse("XGBRegressor Model RMSE=", xgb_pred_data)
et_mse, et_rmse = report_rmse("ExtraTreesRegressor Model RMSE=", et_pred_data)
br_mse, br_rmse = report_rmse("BaggingRegressor Model RMSE=", br_pred_data)
gbr_mse, gbr_rmse = report_rmse("GradientBoostingRegressor Model RMSE=", gbr_pred_data)

In [None]:
# Run the competition test set through the preprocessing fitted on the training split
# (transform only, so the test rows do not influence the fitted statistics or categories).
test_data_prepared = full_pipeline.transform(test_data)

In [None]:
# Inspect the encoded test matrix without densifying all of it: one-hot
# encoding makes the matrix wide, so converting every row to a dense array can
# use a very large amount of memory.
print(test_data_prepared.shape)
test_preview = test_data_prepared[:5]
# The transformer may hand back a sparse matrix or a plain ndarray; only the
# former has (and needs) .toarray().
print(test_preview.toarray() if hasattr(test_preview, 'toarray') else test_preview)

In [None]:
# Final predictions for the test set, produced by the random forest model.
finish_predicted_data = RF_model.predict(test_data_prepared)

In [None]:
# Load the sample submission file; its `id` column is reused for the submission below.
df_empty = pd.read_csv("/content/sample_solution.csv")

In [None]:
# Build the submission frame: each id from the sample file is paired with the
# prediction in the same row position.
submission_columns = {
    'id': df_empty['id'],
    'Price': finish_predicted_data,
}
df = pd.DataFrame(submission_columns)
df.head()

In [None]:
# Write the submission without the DataFrame index.
# NOTE(review): the path is relative to the working directory. If that is
# /content (presumably the case on Colab, TODO confirm), this overwrites the
# sample file that was read from /content/sample_solution.csv.
df.to_csv('sample_solution.csv', index=False)

In [None]:
# Read the submission back from the same relative path it was written to, so
# the preview shows the file that was just saved. The absolute /content path
# used before only pointed at that file when the working directory was /content;
# otherwise it showed the untouched sample file.
dft = pd.read_csv('sample_solution.csv')
dft.head()