In [41]:
import pandas as pd
import numpy as np

# Data loading and preparation

In [42]:
# Read the preprocessed Twitch dataset; the path is relative to the kernel's working directory.
twitch_df = pd.read_csv(filepath_or_buffer='../data/twitch_data_preprocessed.csv')

In [43]:
# Preview the first five rows to sanity-check the columns that were loaded.
twitch_df.head(n=5)

Unnamed: 0,Watch time(Minutes),Stream time(minutes),Peak viewers,Average viewers,Followers,Followers gained,Views gained
0,6196161750,215250,222720,27716,3246298,1734810,93036735
1,6091677300,211845,310998,25610,5310163,1370184,89705964
2,5644590915,515280,387315,10976,1767635,1023779,102611607
3,3970318140,517740,300575,7714,3944850,703986,106546942
4,3671000070,123660,285644,29602,8938903,2068424,78998587


In [44]:
# Split the loaded frame into the regression target (y) and the feature matrix (X).
# `twitch_df` itself is deliberately not reassigned: dropping the column from the
# source frame made this cell fail with KeyError when it was run a second time.
TARGET_COLUMN = 'Followers gained'
y = twitch_df[TARGET_COLUMN]
X = twitch_df.drop(columns=[TARGET_COLUMN])

In [45]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=101,
)

In [46]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test rows never influence the statistics.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [47]:
def evaluate(model, test_features, test_labels):
    """Print and return a MAPE-based "accuracy" score for a fitted regressor.

    Args:
        model: Fitted estimator exposing ``predict(test_features)``.
        test_features: Feature matrix passed unchanged to ``model.predict``.
        test_labels: True target values (array-like), one per prediction.

    Returns:
        ``100 - MAPE`` in percent. The value is negative when the mean
        absolute percentage error exceeds 100 %, and NaN when every label is
        zero (the percentage error is undefined in that case).
    """
    labels = np.asarray(test_labels, dtype=float)
    predictions = np.asarray(model.predict(test_features), dtype=float)
    errors = np.abs(predictions - labels)

    # Percentage error is undefined for a zero label, so those rows are left
    # out of the MAPE instead of turning the whole mean into inf/NaN.
    nonzero = labels != 0
    if nonzero.any():
        # Divide by |label|: with a signed denominator, negative targets give
        # negative "percentage errors" that cancel out the positive ones.
        mape = 100 * np.mean(errors[nonzero] / np.abs(labels[nonzero]))
    else:
        mape = float('nan')
    accuracy = 100 - mape

    print('Model Performance')
    # No unit suffix: the error is expressed in the target's own units.
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy


# Basic Random Forest Regressor

In [48]:
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

In [49]:
# Baseline forest with library-default hyperparameters and a pinned seed.
# fit() returns the estimator itself, so construction and training are chained.
reg = RandomForestRegressor(random_state=42).fit(X_train, y_train)
reg

RandomForestRegressor(n_estimators=10, random_state=42)

In [50]:
# Score the fitted model on the held-out split with the R^2 metric.
y_pred = reg.predict(X_test)
print(r2_score(y_true=y_test, y_pred=y_pred))

0.5122128868258025


# Basic Random Forest Regressor Hyperparameter Tuning

In [51]:
# Candidate random-forest hyperparameters for the exhaustive grid search;
# the value ranges were chosen from the results of a random search.
param_grid = dict(
    bootstrap=[True],
    max_depth=[80, 90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300, 1000],
)

In [52]:
# Reference point for the tuning below: a small 10-tree forest scored with evaluate().
base_model = RandomForestRegressor(n_estimators=10, random_state=42).fit(X_train, y_train)
base_accuracy = evaluate(model=base_model, test_features=X_test, test_labels=y_test)

Model Performance
Average Error: 113104.9575 degrees.
Accuracy = 0.07%.


In [53]:
# Exhaustive 3-fold search over `param_grid`, using every available core.
grid_search = GridSearchCV(
    reg,
    param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [54]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

Fitting 3 folds for each of 288 candidates, totalling 864 fits


KeyboardInterrupt: 

# Gradient Boosting Regressor

In [None]:
# Default gradient-boosting model. The seed is pinned, like for the other
# estimators in this notebook, so repeated runs give the same fit.
reg = GradientBoostingRegressor(random_state=42)
reg.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out split with the R^2 metric.
y_pred = reg.predict(X_test)
print(r2_score(y_true=y_test, y_pred=y_pred))

# Gradient Boosting Regressor Hyperparameter Tuning

In [None]:
# Search space for the gradient-boosting model.
# Learning rates above 1 are left out: each stage would overshoot the residual
# it is fitting, so the ensemble diverges and those candidates only burn
# compute (40 of the original 100 combinations).
parameters = {
    "n_estimators":[5,50,250,500],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,1]
}

In [None]:
# Exhaustive 3-fold search over `parameters`, using every available core.
grid_search = GridSearchCV(
    reg,
    parameters,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

# Random Forest Stacking Regressor

In [None]:
# Stack a small random forest under a linear meta-learner.
# The final estimator has to be a regressor: LogisticRegression is a classifier
# and rejects a continuous target when the stack is fitted.
estimators = [
     ('rf', RandomForestRegressor(n_estimators=10, random_state=42))]
reg = StackingRegressor(
     estimators=estimators, final_estimator=LinearRegression())

In [None]:
# Train the current `reg` on the training split, then report R^2 on the held-out split.
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(r2_score(y_true=y_test, y_pred=y_pred))

# Random Forest Stacking Regressor Hyperparameter Tuning

In [None]:
# `reg` is the StackingRegressor here, so the forest hyperparameters have to be
# addressed through its named base estimator ('rf') with the `<name>__<param>`
# syntax; the bare keys of `param_grid` are not parameters of the stack itself
# and would make the search fail with an invalid-parameter error.
stacking_param_grid = {
    'rf__{}'.format(name): values for name, values in param_grid.items()
}
grid_search = GridSearchCV(estimator = reg, param_grid = stacking_param_grid,
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

# Random Forest Bagging Regressor

In [None]:
# Bag ten small random forests.
# The wrapped estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases, and the first
# positional argument works with either name.
reg = BaggingRegressor(
    RandomForestRegressor(n_estimators=10, random_state=42),
    n_estimators=10,
    random_state=0,
)
reg.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out split with the R^2 metric.
y_pred = reg.predict(X_test)
print(r2_score(y_true=y_test, y_pred=y_pred))

# Random Forest Bagging Regressor Hyperparameter Tuning

In [None]:
# `reg` is the BaggingRegressor here, so the forest hyperparameters have to be
# routed to the wrapped estimator with the `<param>__<subparam>` syntax.
# Depending on the scikit-learn release, the wrapper stores the forest under
# `estimator` or `base_estimator`; pick whichever one actually holds an estimator.
bagging_params = reg.get_params(deep=False)
inner_key = next(
    key for key in ('estimator', 'base_estimator')
    if hasattr(bagging_params.get(key), 'get_params')
)
bagging_param_grid = {
    '{}__{}'.format(inner_key, name): values
    for name, values in param_grid.items()
}
# The original call was missing its closing parenthesis (SyntaxError).
grid_search = GridSearchCV(estimator = reg, param_grid = bagging_param_grid,
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))