# **Twitch Streamer Analysis**

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# Reading the Data

In [None]:
# Load the Twitch dataset CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/twitchdata/twitchdata-update.csv')
# Bare expression: the notebook renders the frame as the cell output.
df

# Understanding the Data

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# (number of rows, number of columns).
df.shape

In [None]:
# Total number of cells in the frame (rows x columns).
df.size

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Concise summary: per-column non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics of the frame's columns.
df.describe()

In [None]:
# Pairwise correlation of the numeric/boolean columns.
# The frame still contains string columns at this point ('Channel', 'Language'),
# and recent pandas versions raise on them instead of silently skipping them,
# so select the usable columns explicitly.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

# Exploratory Data Analysis

In [None]:
# missingno is imported under the alias `no` and used only in this cell.
import missingno as no
# Draw missingno's bar chart for the frame, in light green.
no.bar(df, color='lightgreen')

In [None]:
# Heatmap of the boolean missing-value mask of the frame.
# yticklabels must be the boolean False to hide the row labels; the string
# 'False' is a non-empty sequence and is interpreted as custom tick labels.
sns.heatmap(df.isnull(), yticklabels=False, cmap='Oranges')

In [None]:
# Default pandas plot of the frame.
df.plot()

In [None]:
# Share of channels per language, drawn as a pie chart.
language_counts = df['Language'].value_counts()

plt.figure(figsize=(10, 8))
plt.pie(
    language_counts.values,
    labels=language_counts.index,
    autopct='%0.2f%%',
)
plt.title('Percentage of Language', fontsize=15)
plt.show()

In [None]:
plt.figure(figsize=(20,8))
# Rank ALL channels by views gained and then keep the 15 highest.
# Slicing the first 15 rows before sorting would only reorder whichever rows
# happen to come first in the file, not the actual top gainers.
top_views = df.sort_values(by='Views gained', ascending=False).head(15)
sns.barplot(x='Channel', y='Views gained', data=top_views, palette='Paired')
plt.title("Top views gained Channels are", fontsize=15)

In [None]:
# Use seaborn's own darkgrid theme: the matplotlib style name 'seaborn-darkgrid'
# is no longer available in recent matplotlib releases.
sns.set_style('darkgrid')
plt.figure(figsize=(14,7))
# Average viewers against stream time, one line per language.
sns.lineplot(x="Stream time(minutes)", y="Average viewers", hue="Language", data=df)
plt.title('Streaming time v/s Average Viewers', fontsize = 20)
plt.show()

In [None]:
# Use seaborn's own darkgrid theme: the matplotlib style name 'seaborn-darkgrid'
# is no longer available in recent matplotlib releases.
sns.set_style('darkgrid')
plt.figure(figsize=(12,7))
# x and y are passed by keyword: recent seaborn versions do not accept them
# as positional arguments.
sns.lineplot(x=df['Stream time(minutes)'], y=df['Followers gained'])
plt.title('Streaming time v/s Followers gained', fontsize = 20)
plt.show()

In [None]:
# Scatter of average viewers against followers, coloured by language.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=df,
    x='Followers',
    y='Average viewers',
    hue='Language',
)
plt.show()

In [None]:
# Bar plot of followers for each language.
plt.figure(figsize=(18, 6))
sns.barplot(
    data=df,
    x='Language',
    y='Followers',
    palette='rainbow',
)
plt.title('Followers in Language', fontsize=15)

In [None]:
# Distribution of average viewers within each language.
plt.figure(figsize=(18, 8))
sns.violinplot(
    data=df,
    x='Language',
    y='Average viewers',
    palette='Set2',
)

In [None]:
# Count of channels per 'Mature' value, most frequent value first.
plt.figure(figsize=(8, 6))
mature_order = df['Mature'].value_counts().index
plot = sns.countplot(data=df, x="Mature", order=mature_order)

In [None]:
# Count of channels per 'Partnered' value, most frequent value first.
plt.figure(figsize=(8, 6))
partnered_order = df['Partnered'].value_counts().index
plot = sns.countplot(
    data=df,
    x="Partnered",
    order=partnered_order,
    palette='magma_r',
)

In [None]:
# Swarm plot of views gained, with one group of points per language.
plt.figure(figsize=(18,6))
sns.swarmplot(x="Language" , y = "Views gained" ,data=df)
plt.show()

In [None]:
# Pairwise relationships between the frame's columns, coloured by language.
sns.pairplot(df, hue = 'Language', palette='RdYlGn')
plt.show()

In [None]:
plt.figure(figsize=(18,12))
# Correlate only the numeric/boolean columns: the frame still contains string
# columns here ('Channel', 'Language'), which recent pandas versions refuse to
# correlate instead of silently dropping them.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr_matrix, yticklabels='auto', annot=True, cmap='YlGn')
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes. A single encoder object
# is re-fitted on every column in turn, so afterwards it only holds the classes
# of the last column it processed.
categories = ['Partnered', 'Mature', 'Language']
labelencoding = LabelEncoder()
df[categories] = df[categories].apply(labelencoding.fit_transform)
df

In [None]:
# Remove the 'Channel' column from the frame.
df = df.drop(columns='Channel')

In [None]:
# Data type of every column in the current frame.
df.dtypes

In [None]:
# One stacked subplot per column. The grid size and figure height are derived
# from the number of columns (5 inches per row) instead of being hard-coded,
# so the cell keeps working if columns are added or removed.
n_panels = len(df.columns)
plt.figure(figsize=(10, 5 * n_panels))
for i, column in enumerate(df.columns):
    plt.subplot(n_panels, 1, i + 1)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # it is kept here to preserve the existing look (blue histogram, red KDE).
    sns.distplot(df[column], kde_kws={"color": "r", "lw": 3, "label": "KDE"}, hist_kws={"color": "b"})
    plt.title(column)

plt.tight_layout()

# Splitting the Data into Dependent and Independent Variables

In [None]:
# Target: the 'Followers gained' column.
y = df['Followers gained']
# Features: every remaining column except the three categorical ones.
x = df.drop(columns=['Followers gained', 'Partnered', 'Mature', 'Language'])

In [None]:
# (rows, columns) of the feature matrix.
x.shape

# Feature Importance

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
# Fit an extra-trees ensemble on the full feature set to rank the features.
# random_state is fixed (same seed as the train/test split) so the importances
# are identical on every run.
model = ExtraTreesRegressor(random_state=25)
model.fit(x,y)
print(model.feature_importances_)

In [None]:
# Pair each importance value with its feature (column) name.
feat_imp = pd.Series(model.feature_importances_, index=x.columns)

In [None]:
# Horizontal bar chart of the feature importances.
feat_imp.plot(kind='barh')

# Training and Testing the Data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state is fixed so the split is the same on every run.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=25)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression with default settings on the training split.
lr = LinearRegression()
lr.fit(xtrain, ytrain)

## Prediction

In [None]:
# Linear-regression predictions for both splits; these names are reused by the metric cells that follow.
ypred_train = lr.predict(xtrain)
ypred_test = lr.predict(xtest)

## Accuracy

In [None]:
from sklearn import metrics

# The value reported as "accuracy" is the R^2 score multiplied by 100.
train_r2_pct = metrics.r2_score(ytrain, ypred_train) * 100
print("Accuracy of training data:", train_r2_pct)

# Test-split score is kept in ac1 for the model comparison.
ac1 = metrics.r2_score(ytest, ypred_test) * 100
print("Accuracy of testing data:", ac1)

## Error

In [None]:
# Error metrics on the test split; the MSE is computed once and reused for the RMSE.
test_mae = metrics.mean_absolute_error(ytest, ypred_test)
test_mse = metrics.mean_squared_error(ytest, ypred_test)
print('Mean Absolute Error:', test_mae)
print('Mean Squared Error:', test_mse)
print('Root Mean Squared Error:', np.sqrt(test_mse))

# Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Depth-limited regression tree fitted on the training split.
# random_state is fixed (same seed as the train/test split) so that ties
# between equally good splits are resolved the same way on every run.
dt = DecisionTreeRegressor(max_depth=5, random_state=25)
dt.fit(xtrain, ytrain)

## Prediction

In [None]:
# Decision-tree predictions for both splits; these overwrite the previous ypred_train / ypred_test.
ypred_train = dt.predict(xtrain)
ypred_test = dt.predict(xtest)

## Accuracy

In [None]:
from sklearn import metrics

# The value reported as "accuracy" is the R^2 score multiplied by 100.
train_r2_pct = metrics.r2_score(ytrain, ypred_train) * 100
print("Accuracy of training data:", train_r2_pct)

# Test-split score is kept in ac2 for the model comparison.
ac2 = metrics.r2_score(ytest, ypred_test) * 100
print("Accuracy of testing data:", ac2)

## Error

In [None]:
# Error metrics on the test split; the MSE is computed once and reused for the RMSE.
test_mae = metrics.mean_absolute_error(ytest, ypred_test)
test_mse = metrics.mean_squared_error(ytest, ypred_test)
print('Mean Absolute Error:', test_mae)
print('Mean Squared Error:', test_mse)
print('Root Mean Squared Error:', np.sqrt(test_mse))

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with depth-limited, cost-complexity-pruned trees.
# random_state is fixed (same seed as the train/test split): without it the
# bootstrap samples change on every run, so the reported scores and the
# pickled model could not be reproduced.
rf = RandomForestRegressor(max_depth=8, ccp_alpha=0.05, random_state=25)
rf.fit(xtrain, ytrain)

## Prediction

In [None]:
# Random-forest predictions for both splits; these overwrite the previous ypred_train / ypred_test.
ypred_train = rf.predict(xtrain)
ypred_test = rf.predict(xtest)

## Accuracy

In [None]:
from sklearn import metrics

# The value reported as "accuracy" is the R^2 score multiplied by 100.
train_r2_pct = metrics.r2_score(ytrain, ypred_train) * 100
print("Accuracy of training data:", train_r2_pct)

# Test-split score is kept in ac3 for the model comparison.
ac3 = metrics.r2_score(ytest, ypred_test) * 100
print("Accuracy of testing data:", ac3)

## Error

In [None]:
# Error metrics on the test split; the MSE is computed once and reused for the RMSE.
test_mae = metrics.mean_absolute_error(ytest, ypred_test)
test_mse = metrics.mean_squared_error(ytest, ypred_test)
print('Mean Absolute Error:', test_mae)
print('Mean Squared Error:', test_mse)
print('Root Mean Squared Error:', np.sqrt(test_mse))

# Comparing Accuracy

In [None]:
# Maps each model's test score (key) to its display name (value).
# ac1 comes from the LinearRegression model, so it is labelled accordingly
# (it was previously mislabelled as logistic regression).
# NOTE(review): because the scores are the keys, two models with exactly the
# same score would collapse into a single entry.
accuracy =  {ac1: 'Linear Regression', ac2: 'Decission Tree', ac3:'Random Forest'}

In [None]:
sns.set_style('darkgrid')
plt.figure(figsize=(14, 10))
# `accuracy` is keyed by score with the model name as the value, so the
# dictionary values are the bar labels and the keys are the bar heights.
bar_labels = list(accuracy.values())
bar_heights = list(accuracy.keys())
sns.barplot(x=bar_labels, y=bar_heights, palette='rainbow')

The **Random Forest Regressor** achieves the highest test score of the three models, i.e. **63.58%** ~ **64%**.

Hence we will save the model.

# Saving the Model

In [None]:
import pickle
# Serialize the fitted random forest to model.pkl. The context manager makes
# sure the file handle is flushed and closed even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)

# Sample Prediction

In [None]:
# Build the single sample as a DataFrame carrying the training feature names,
# so the model receives input in the same form it was fitted on.
sample = pd.DataFrame(
    [[6196161750, 215250, 222720, 27716, 3246298, 93036735]],
    columns=x.columns,
)
p = rf.predict(sample)
# predict() returns a one-element array; take the element explicitly rather
# than relying on implicit array-to-int conversion.
print("Followers Gained = ", int(p[0]))