### Import dependencies

In [None]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import shapiro
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
import optuna
import seaborn as sns
import numpy as np
import torch

Read data from ```csv``` files

In [None]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

Now, let's train the model

### Linear regression

In [22]:
cols = ['tutor_rating', 'tutor_reviews', 'experience'] + [f'v_{i}' for i in range(25)] + [f'{i}_len' for i in range(9)]
x = train[cols]
y = train['mean_price']

lp = {'fit_intercept': [True, False], 'positive': [True, False]}
linear_grid_search_class = GridSearchCV(estimator=LinearRegression(), param_grid=lp)
linear_grid_search_class.fit(x, y)

In [23]:
output = pd.read_csv("sample_submit.csv")
output['mean_price'] = linear_grid_search_class.predict(test[cols])
output

Unnamed: 0,index,mean_price
0,0,16.093571
1,1,10.385500
2,2,19.250314
3,3,15.313132
4,4,11.159182
...,...,...
1511,1511,13.031270
1512,1512,17.281056
1513,1513,13.333523
1514,1514,12.788619


Save CSV result

In [24]:
output.to_csv('1.csv')

### Random forest

In [25]:
cols_copy = list(set(test.columns) - set(cols + ['categories', 'tutor_head_tags']))
x = train[cols + cols_copy]
y = train['mean_price']

target_encoder_class = TargetEncoder(cols=cols_copy)
x = target_encoder_class.fit_transform(x, y)

forest_regressor_class = RandomForestRegressor()
forest_regressor_class.fit(x, y)

cols2 = ['tutor_rating', 'tutor_reviews', 'experience'] + [f'v_{i}' for i in range(25)] + [f'{i}_len' for i in range(9)]
x = train[cols2]
y = train['mean_price']

pipeline = make_pipeline(GridSearchCV(estimator=forest_regressor_class,
                                      param_grid={'n_estimators': [1000], 'max_depth': [3, 5]},
                                      cv=4,
                                      refit=True))
pipeline.fit(x, y)

In [26]:
forest_test = target_encoder_class.transform(test[cols2 + cols_copy])
output['mean_price'] = forest_regressor_class.predict(forest_test)
output

Unnamed: 0,index,mean_price
0,0,16.666667
1,1,11.796204
2,2,23.056204
3,3,0.119880
4,4,16.666667
...,...,...
1511,1511,28.662130
1512,1512,9.548889
1513,1513,16.666667
1514,1514,10.861667


Save CSV result

In [27]:
output.to_csv('2.csv')

### Gradient boosting

In [28]:
grid = {'n_estimators': [100, 250, 500], 'learning_rate': [0.0015, 0.015, 0.15]}
boosting_grid_search_class = GridSearchCV(param_grid=grid,
                            estimator=GradientBoostingRegressor())
boosting_grid_search_class.fit(x, y)

In [30]:
output.to_csv('3.csv')