In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


In [1]:
# Load the labelled training frame and the unlabelled competition test frame.
df, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/aidl/train.csv', '/kaggle/input/aidl/test.csv')
)

# EDA (copied from https://www.kaggle.com/aayushmishra1512/twitch-top-streamers-data-eda-linearreg)

In [1]:
# Peek at the first five rows to see the available columns.
df.head(n=5)

In [1]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().transpose()

In [1]:
# Bar chart of stream time for the first 50 rows of the training frame
# (presumably the top 50 streamers — assumes the CSV is sorted by rank; confirm).
plt.style.use('dark_background')
fig, ax = plt.subplots(figsize=(20, 7))
df['Stream time(minutes)'].head(50).plot.bar(ax=ax, color='orangered')
ax.set_title('Comparing the different stream times (in minutes)')
ax.set_xlabel('Streamers')
# The bars show minutes streamed, not a count, so label the axis with the unit.
ax.set_ylabel('Stream time (minutes)')
plt.show()

In [1]:
# Bar chart of followers gained for the first 50 rows of the training frame
# (presumably the top 50 streamers — assumes the CSV is sorted by rank; confirm).
plt.style.use('dark_background')
fig, ax = plt.subplots(figsize=(20, 7))
df['Followers gained'].head(50).plot.bar(ax=ax, color='orangered')
ax.set_title('Comparing the followers gained by our Top 50 Streamers')
ax.set_xlabel('Streamers')
ax.set_ylabel('Count')
plt.show()

In [1]:
# Bar chart of views gained for the first 50 rows of the training frame
# (presumably the top 50 streamers — assumes the CSV is sorted by rank; confirm).
plt.style.use('dark_background')
fig, ax = plt.subplots(figsize=(20, 7))
df['Views gained'].head(50).plot.bar(ax=ax, color='orangered')
ax.set_title('Comparing the views gained by our Top 50 Streamers')
ax.set_xlabel('Streamers')
ax.set_ylabel('Count')
plt.show()

In [1]:
# Bar chart of the average number of viewers for the first 50 rows of the
# training frame (presumably the top 50 streamers — assumes the CSV is sorted
# by rank; confirm).
plt.style.use('dark_background')
fig, ax = plt.subplots(figsize=(20, 7))
df['Average viewers'].head(50).plot.bar(ax=ax, color='orangered')
ax.set_title('Comparing the average viewers of our Top 50 Streamers')
ax.set_xlabel('Streamers')
ax.set_ylabel('Count')
plt.show()

In [1]:
# Number of channels per streaming language, limited to the 20 most common
# languages (value_counts sorts by frequency, head(20) keeps the top of it).
plt.style.use('dark_background')
fig, ax = plt.subplots(figsize=(20, 7))
language_counts = df['Language'].value_counts()
language_counts.head(20).plot.bar(ax=ax, color='orangered')
ax.set_title('Languages that Streamers stream in')
ax.set_xlabel('Languages')
ax.set_ylabel('Count')
plt.show()

In [1]:
# Show the dtype of every column; the modelling cells below rely on this to
# separate object (string) columns from the numeric/boolean ones.
df.dtypes

In [1]:
# How many channels fall into each value of the 'Partnered' flag.
sns.countplot(data=df, x='Partnered')

In [1]:
# First 10 rows whose 'Partnered' flag is True, restricted to a few headline columns.
df.loc[
    df['Partnered'] == True,
    ['Channel', 'Watch time(Minutes)', 'Stream time(minutes)', 'Followers'],
].head(10)

In [1]:
# How many channels fall into each value of the 'Mature' flag.
sns.countplot(data=df, x='Mature')

In [1]:
# Pairwise correlation between the numeric/boolean columns of the dataset.
plt.figure(figsize=(12,8))
corr_columns = ['Channel', 'Watch time(Minutes)', 'Stream time(minutes)', 'Followers',
                'Peak viewers', 'Average viewers', 'Followers gained', 'Views gained',
                'Partnered', 'Mature', 'Language']
# 'Channel' and 'Language' are compared against strings elsewhere in this
# notebook, i.e. they are text columns. pandas >= 2.0 raises when
# DataFrame.corr() meets non-numeric columns (older releases silently dropped
# them), so keep only numeric and boolean columns before correlating. This
# reproduces the old behaviour on every pandas version.
corr_matrix = df[corr_columns].select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr_matrix, annot = True)
plt.title('Overall relation between columns of the Dataset', fontsize = 20)
plt.show()

In [1]:
def streamer(x):
    """Return every row of the global ``df`` whose 'Channel' value equals ``x``.

    Parameters
    ----------
    x : value compared (with ``==``) against the 'Channel' column.

    Returns
    -------
    pandas.DataFrame with all columns of ``df``; empty when nothing matches.
    """
    is_requested_channel = df['Channel'] == x
    return df[is_requested_channel]

In [1]:
def lang(x):
    """Return up to 10 channels from the global ``df`` that stream in language ``x``.

    Parameters
    ----------
    x : value compared (with ``==``) against the 'Language' column.

    Returns
    -------
    pandas.DataFrame holding the 'Channel', 'Followers', 'Partnered' and
    'Mature' columns of the first 10 matching rows (fewer if fewer match).
    """
    shown_columns = ['Channel', 'Followers', 'Partnered', 'Mature']
    same_language = df[df['Language'] == x]
    return same_language[shown_columns].head(10)

In [1]:
# Example lookup: all rows for the channel named 'Anomaly'.
streamer(x='Anomaly')

In [1]:
# Example lookup: first 10 channels whose language is 'Spanish'.
lang(x='Spanish')

In [1]:
# Streaming time v/s followers gained.
plt.figure(figsize=(12,8))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. The former `palette` argument is dropped because a palette only
# applies to a `hue` grouping, and none is used here.
sns.lineplot(x=df['Stream time(minutes)'], y=df['Followers gained'])
plt.title('Streaming time v/s Followers gained', fontsize = 20)
plt.show()

In [1]:
# Streaming time v/s average viewers.
plt.figure(figsize=(12,8))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. The former `palette` argument is dropped because a palette only
# applies to a `hue` grouping, and none is used here.
sns.lineplot(x=df['Stream time(minutes)'], y=df['Average viewers'])
plt.title('Streaming time v/s Average Viewers', fontsize = 20)
plt.show()

# TRAINING

In [1]:
# Hand-picked numeric features and the regression target.
# NOTE: both X and y are reassigned at the top of the XGBoost cell below before
# any model is fitted, so this selection is not what the models train on.
y = df.loc[:, 'Followers gained']
X = df.loc[:, [
    'Watch time(Minutes)',
    'Stream time(minutes)',
    'Peak viewers',
    'Average viewers',
    'Followers',
    'Views gained',
]]

## 1. XGBoost Regressor

In [1]:
from xgboost import XGBRegressor

# Target is the number of followers gained; features are every remaining
# column that is not of object (string) dtype.
y = df['Followers gained']
X = df.drop(['Followers gained'], axis=1).select_dtypes(exclude=['object'])

# Seed the shuffle so the split, and therefore every metric computed from it,
# is reproducible across kernel restarts. 45 matches the seed given to the
# CatBoost model later in the notebook.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=45)

# X already excludes object columns, so the splits need no further filtering.
# early_stopping_rounds is set on the estimator: passing it to fit() is
# deprecated since XGBoost 1.6 and rejected by current releases.
model_xgb = XGBRegressor(n_estimators=1000, early_stopping_rounds=5)
# NOTE(review): early stopping monitors the same hold-out set that is later
# used for scoring, so the reported XGBoost metrics are optimistic.
model_xgb.fit(train_X, train_y, eval_set=[(test_X, test_y)], verbose=False)

predictions_XGB = model_xgb.predict(test_X)

## 2. CatBoost Regressor

In [1]:
from catboost import CatBoostRegressor

# Placeholders for the model and its predictions; both are assigned in the next cell.
cbr, cb_pred = None, None

# CatBoost feature matrix: everything except the channel identifier and the target.
y = df['Followers gained']
X = df.drop(['Channel', 'Followers gained'], axis=1)

# Reuse the row partition created in the XGBoost cell instead of drawing a new
# random split. The evaluation cell scores BOTH models against `test_y`; a
# second, different split would overwrite `test_y` and the XGBoost predictions
# would then be compared with targets from other rows, making its metrics
# meaningless. Selecting by the existing index labels keeps the hold-out rows
# identical for both models.
train_X, test_X = X.loc[train_X.index], X.loc[test_X.index]
train_y, test_y = y.loc[train_X.index], y.loc[test_X.index]



In [1]:
# CatBoost: fit on the training split, then predict the hold-out split.
# NOTE(review): no eval_set is given to fit(), so early_stopping_rounds
# presumably never triggers — confirm against the CatBoost documentation.
cbr = CatBoostRegressor(
    logging_level='Silent',
    random_state=45,
    early_stopping_rounds=300,
)

# Positions 6, 7 and 8 of the feature matrix are declared categorical
# (presumably 'Partnered', 'Mature' and 'Language' — verify against train_X.columns).
cbr.fit(X=train_X, y=train_y, cat_features=[6, 7, 8], plot=True)
cb_pred = cbr.predict(test_X, verbose=True)

## 3. Evaluate Results

In [1]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


def _print_regression_report(banner, y_true, y_pred):
    """Print R^2, RMSE and MAE of ``y_pred`` against ``y_true`` under ``banner``."""
    print(banner)
    print('r2 score: '+str(r2_score(y_true, y_pred)))
    print('RMSE : '+str(np.sqrt(mean_squared_error(y_true, y_pred))))
    # MAE is symmetric in its two arguments, so the conventional
    # (y_true, y_pred) order yields the same value.
    print("Mean Absolute Error : " + str(mean_absolute_error(y_true, y_pred)))


# XGBoost
_print_regression_report('--------------- XGBOOST -----------', test_y, predictions_XGB)

# CatBoost
_print_regression_report('\n--------------- CATBOOST -----------', test_y, cb_pred)




# SUBMISSION

In [1]:
# Predict the competition test set with both fitted models.
# XGBoost receives the non-object columns; CatBoost receives everything except
# the channel identifier. Only pred_cat is used for the submission below.
pred_cat = cbr.predict(df_test.drop(columns=['Channel']), verbose=True)
pred_xgb = model_xgb.predict(df_test.select_dtypes(exclude=['object']))

In [1]:
# Pair every test channel with its CatBoost prediction.
submission_result = pd.DataFrame({
    'Channel': df_test['Channel'],
    'Followers gained': pred_cat,
})

In [1]:
# Write the submission file without the DataFrame index column.
submission_result.to_csv(path_or_buf='submission.csv', index=False)