## Predicting Bigg Boss Telugu (బిగ్ బాస్ తెలుగు) Season 3 Winner

## Using Python scikit learn https://satya-python.blogspot.com/

### Importing Required Python Libraries

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 50)

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,confusion_matrix,roc_curve,auc
from sklearn.preprocessing import StandardScaler,PolynomialFeatures,QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings('ignore')

### Check and import dataset

In [None]:
!cd /kaggle/input/bigg-boss-india-hindi-telugu-tamil-kannada; ls -tlr

In [None]:
# Read the combined Bigg Boss India CSV (decoded as ISO-8859-1) and report its size
bigg_boss = pd.read_csv(
    '/kaggle/input/bigg-boss-india-hindi-telugu-tamil-kannada/Bigg_Boss_India.csv',
    encoding="ISO-8859-1",
)
nRow, nCol = bigg_boss.shape
print(f'There are {nRow} rows and {nCol} columns')

## Exploratory Data Analysis (EDA)

In [None]:
# First five records (head() defaults to n=5)
bigg_boss.head()

In [None]:
# Last ten records, transposed so that each record is shown as a column
bigg_boss.tail(10).transpose()

In [None]:
# Ten randomly chosen records
bigg_boss.sample(n=10)

In [None]:
# Column dtypes and non-null counts
bigg_boss.info()

In [None]:
# Number of distinct (non-null) values per column
for column_name, distinct_count in bigg_boss.nunique().items():
    print("Number of unique values in", column_name, "-", distinct_count)

In [None]:
# Total number of seasons: distinct season numbers per language, summed over languages
seasons_by_language = bigg_boss.groupby('Language')['Season Number'].nunique()
print(seasons_by_language.sum())

# 31 seasons happened (including current seasons)

## Bigg Boss Hindi has had more seasons than the other Indian-language editions, so it also has the most housemates.

In [None]:
# Season count for each Indian language, largest first (at most 10 rows)
season_counts = bigg_boss.groupby('Language')['Season Number'].nunique()
print(season_counts.nlargest(10))

In [None]:
# Total number of Bigg Boss housemates, one bar per language
fig = plt.figure(figsize=(10, 4))
ax = sns.countplot(x='Language', data=bigg_boss)
ax.set_title('Bigg Boss Series - Indian Language')
# Write the count above every bar; a bar whose height is NaN is labelled 0 at the baseline
for bar in ax.patches:
    bar_height = bar.get_height()
    if np.isnan(float(bar_height)):
        ax.annotate(0, (bar.get_x(), 0))
        continue
    ax.annotate(f"{int(bar_height):,d}", (bar.get_x(), bar_height * 1.01))

## In Bigg Boss India seasons, most of the housemates (87%) entered in first day/week

In [None]:
# Normal entries vs wild card entries: absolute counts, rounded percentages, then a bar chart
wild_card = bigg_boss['Wild Card']
print(wild_card.value_counts(), "\n")
wild_card_share = wild_card.value_counts(normalize=True) * 100
print(round(wild_card_share))
sns.countplot(x='Wild Card', data=bigg_boss)

## Film actresses have entered the Bigg Boss houses in greater numbers than any other profession

In [None]:
# Participant's Profession: frequency table followed by a bar chart
profession_counts = bigg_boss['Profession'].value_counts()
print(profession_counts)
fig = plt.figure(figsize=(20, 5))
sns.countplot(x='Profession', data=bigg_boss)
# Rotate the tick labels so the profession names do not overlap
plt.xticks(rotation=90)

In [None]:
# Broadcaster: number of housemates per broadcasting channel
fig = plt.figure(figsize=(20, 5))
ax = sns.countplot(x='Broadcasted By', data=bigg_boss, palette='RdBu')
ax.set_title('Bigg Boss Series - Indian Broadcastor & Total Number of Housemates')
# Write the count above every bar; a bar whose height is NaN is labelled 0 at the baseline
for bar in ax.patches:
    bar_height = bar.get_height()
    if np.isnan(float(bar_height)):
        ax.annotate(0, (bar.get_x(), 0))
        continue
    ax.annotate(f"{int(bar_height):,d}", (bar.get_x(), bar_height * 1.01))

## Salman Khan hosted most number of seasons (in Bigg Boss Hindi), Sudeep is next in the list

In [None]:
# Distinct seasons hosted by each host, largest first (at most 25 rows)
seasons_per_host = bigg_boss.groupby('Host Name')['Season Number'].nunique()
seasons_per_host.nlargest(25)

## Across all Bigg Boss languages and seasons combined, female contestants outnumber male contestants

In [None]:
# Housemate's Gender: frequency of each value
gender_counts = bigg_boss['Gender'].value_counts()
print(gender_counts)

# Female         221
# Male           192
# Transgender      3

### Only 3 Transgenders participated in all Indian languages

In [None]:
# Maximum TRP of Bigg Boss Hindi/India seasons, and the first season that reached it
highest_trp = bigg_boss['Average TRP'].max()
print("Maximum TRP", highest_trp, "\n")
top_trp_rows = bigg_boss.loc[bigg_boss['Average TRP'] == highest_trp, ["Language", "Season Number"]]
print(top_trp_rows.head(1).to_string(index=False))

## https://satya-data.blogspot.com/2018/01/bigg-boss-data-set-bigg-boss.html

In [None]:
# All BB Winners (rows where Winner equals 1)
bigg_boss[bigg_boss['Winner'] == 1]

In [None]:
# Profession of BB Season Winners
winner_rows = bigg_boss['Winner'] == 1
bigg_boss.loc[winner_rows, 'Profession'].value_counts()

In [None]:
# Gender of Season title Winners
winner_rows = bigg_boss['Winner'] == 1
bigg_boss.loc[winner_rows, 'Gender'].value_counts()

# Male      17
# Female     9

In [None]:
# Number of eliminations or evictions faced by the Bigg Boss competition winners,
# ordered by the eviction count rather than by frequency
winner_rows = bigg_boss['Winner'] == 1
evictions_faced = bigg_boss.loc[winner_rows, 'Number of Evictions Faced']
evictions_faced.value_counts().sort_index()

## No wild card entry housemate won the Bigg Boss competition.

In [None]:
# Entry type of the Season Winners
winner_rows = bigg_boss['Winner'] == 1
bigg_boss.loc[winner_rows, 'Wild Card'].value_counts()

In [None]:
# No re-entered contestant won Bigg Boss title
winner_rows = bigg_boss['Winner'] == 1
bigg_boss.loc[winner_rows, 'Number of re-entries'].value_counts()

In [None]:
# How often the winners were elected captain, ordered by the captaincy count
# data is not up-to-date
winner_rows = bigg_boss['Winner'] == 1
captaincies = bigg_boss.loc[winner_rows, 'Number of times elected as Captain']
captaincies.value_counts().sort_index()

## BB Telugu Seasons

In [None]:
# Every record of the Telugu editions
bigg_boss[bigg_boss['Language'] == 'Telugu']

In [None]:
# Bigg Boss Telugu Winners
is_telugu = bigg_boss['Language'] == 'Telugu'
is_winner = bigg_boss['Winner'] == 1
bigg_boss.loc[is_telugu & is_winner, :]

In [None]:
# Bigg Boss Telugu current season participants (Winner is still empty for them)
is_telugu = bigg_boss['Language'] == 'Telugu'
winner_unknown = bigg_boss['Winner'].isnull()
bigg_boss.loc[is_telugu & winner_unknown, :]

## Preparing Data for ML modelling

In [None]:
# Handling NULL values: count the missing entries in each column
bigg_boss.isna().sum()

In [None]:
# Removing records where Name field is empty, then renumbering the rows from 0
has_name = bigg_boss['Name'].notna()
bigg_boss = bigg_boss[has_name].reset_index(drop=True)

In [None]:
# Replacement values for missing entries:
#   - a contestant might have faced at least one eviction, so 'Number of Evictions Faced' -> 1
#   - re-entries are rare, so 'Number of re-entries' -> 0
#   - 'Average TRP' -> mean of the known TRP values
fill_values = {
    'Number of Evictions Faced': 1,
    'Number of re-entries': 0,
    'Average TRP': bigg_boss['Average TRP'].mean(),
}
for column_name, replacement in fill_values.items():
    bigg_boss[column_name] = bigg_boss[column_name].fillna(replacement)

In [None]:
# Convert the date columns to datetime values
for date_column in ('Season Start Date', 'Season End Date', 'Entry Date', 'Elimination Date'):
    bigg_boss[date_column] = pd.to_datetime(bigg_boss[date_column])

In [None]:
# First rows after the null handling and date conversion above
bigg_boss.head()

In [None]:
# Last rows after the null handling and date conversion above
bigg_boss.tail()

In [None]:
# Training data: every record whose Winner value is known
winner_known = bigg_boss['Winner'].notna()
train = bigg_boss[winner_known]
train.sample(n=10)

In [None]:
# Rows to predict: Telugu contestants whose Winner value is still empty
is_telugu = bigg_boss['Language'] == 'Telugu'
winner_unknown = bigg_boss['Winner'].isnull()
test = bigg_boss[is_telugu & winner_unknown]
test

In [None]:
# Keep the contestant names (renumbered from 0) so predictions can be joined back to them later
BB_telugu_participant = test[['Name']].reset_index(drop=True)
BB_telugu_participant

In [None]:
# Columns removed from both frames before modelling; test additionally loses the empty target column
non_feature_columns = [
    "Name",
    "Entry Date",
    "Elimination Date",
    "Season Start Date",
    "Season End Date",
    "Elimination Week Number",
]
train = train.drop(columns=non_feature_columns)
test = test.drop(columns=non_feature_columns + ["Winner"])

In [None]:
# Training frame after the column drop
train.head()

In [None]:
# Test frame after the column drop
test.head()

In [None]:
# Spread of target variable, as a percentage per class
winner_share = train['Winner'].value_counts(normalize=True)
print(winner_share * 100)

In [None]:
# One Hot Encoding
# The target is split off first; train and test are then stacked and encoded together,
# and every column is passed to get_dummies (columns=data.columns).

target = train.pop('Winner')
data = pd.concat([train, test])
dummies = pd.get_dummies(data, columns=data.columns, drop_first=True, sparse=True)

# The first len(train) rows of the stacked frame are the training rows; the rest are test rows
n_train_rows = len(train)
train2 = dummies.iloc[:n_train_rows]
test = dummies.iloc[n_train_rows:]

In [None]:
# Shapes of the encoded train and test frames
for encoded_frame in (train2, test):
    print(encoded_frame.shape)

In [None]:
# First rows of the one-hot encoded training frame
train2.head()

In [None]:
# Raw target array (the Winner column popped from train)
target.values

In [None]:
# Hold out 30% of the labelled rows for validation; the seed is fixed for repeatability
x_train, x_val, y_train, y_val = train_test_split(train2, target, test_size=0.3, random_state=2019)
print(x_train.shape, x_val.shape)

## Machine Learning Modelling

In [None]:
# Logistic Regression: compare validation weighted-F1 (x100) across several C values
for c in [0.01, 1, 10, 100, 1000]:
    lr = LogisticRegression(random_state=2019, C=c)
    lr.fit(x_train, y_train)
    val_f1_percent = f1_score(y_val, lr.predict(x_val), average='weighted') * 100
    print("F1 score for C=%s: %s" % (c, val_f1_percent))

In [None]:
# Logistic regression model used for the predictions below (C=10)
logi = LogisticRegression(random_state=2019, C=10)
logi.fit(x_train, y_train)
logi

In [None]:
# Validation probabilities from column 1 of predict_proba (the model's second class)
predicted_val_logi = logi.predict_proba(x_val)[:, 1]
# Probabilities above 0.3 are labelled 1, the rest 0
above_cutoff = predicted_val_logi > 0.3
y_predicted_val = above_cutoff.astype("int").ravel()
print(f1_score(y_val, y_predicted_val, average='weighted') * 100)

In [None]:
# Confusion Matrix
# Transposed to match the axis labels set below (rows = predicted, columns = true);
# every column is then divided by its own total.
label_counts = confusion_matrix(y_val, y_predicted_val).T
cm = label_counts.astype('float') / label_counts.sum(axis=0)
ax = sns.heatmap(cm, annot=True, cmap='Blues')
ax.set_xlabel('True Label', size=12)
ax.set_ylabel('Predicted Label', size=12)

# TP 1 TN 0.78

In [None]:
# Logistic-regression probabilities (column 1 of predict_proba) for the rows in `test`.
# This overwrites the validation probabilities previously stored under the same name.
predicted_val_logi = logi.predict_proba(test)[:, 1]
predicted_val_logi

In [None]:
# Pair every current Telugu contestant with the logistic-regression probability.
# The scores come from `predicted_val_logi` (set from logi.predict_proba(test)).
# The previously used `pred_val_logi` is only assigned later, in the random-forest
# cells, where it holds validation-set scores, so it was undefined or wrong here.
logi_scores = pd.DataFrame(predicted_val_logi, columns=['Predicted_Winner'])
winner = pd.concat([BB_telugu_participant, logi_scores], axis=1)
winner[['Name', 'Predicted_Winner']]

## Predicted Winner for Bigg Boss Telugu Season 3, as per Logistic Regression

In [None]:
# Predicted Winner for Bigg Boss Telugu Season 3, as per LogisticRegression:
# the name in the row with the highest predicted probability
top_position = winner['Predicted_Winner'].argmax()
winner.iloc[top_position]['Name']

![Rahul](https://upload.wikimedia.org/wikipedia/te/f/f9/Rahul_Sipligunj.jpg)

In [None]:
# RandomForest with 200 trees, fitted on the training split
rf = RandomForestClassifier(n_estimators=200, random_state=2019)
rf.fit(x_train, y_train)
rf

In [None]:
# Random-forest validation probabilities (column 1 of predict_proba).
# NOTE(review): the variable keeps a `_logi` suffix although the scores come from `rf`.
pred_val_logi = rf.predict_proba(x_val)[:, 1]
# Probabilities above 0.3 are labelled 1, the rest 0
y_pred_val = (pred_val_logi > 0.3).astype("int").ravel()
print(f1_score(y_val, y_pred_val, average='weighted')*100)

# Scores noted from earlier runs. The print above reports weighted F1 x 100,
# so "accuracy" below presumably refers to that figure - TODO confirm.
# n_estimators=100 accuracy 99.36
# n_estimators=200 accuracy 100

In [None]:
# Confusion Matrix
# Transposed to match the axis labels set below (rows = predicted, columns = true);
# every column is then divided by its own total.
label_counts = confusion_matrix(y_val, y_pred_val).T
cm = label_counts.astype('float') / label_counts.sum(axis=0)
ax = sns.heatmap(cm, annot=True, cmap='Blues')
ax.set_xlabel('True Label', size=12)
ax.set_ylabel('Predicted Label', size=12)

# TP 1 TN 0.78
# TP 1 TN 1

In [None]:
# Random-forest probabilities (column 1 of predict_proba) for the rows in `test`
pred_val_rf = rf.predict_proba(test)[:,1]
pred_val_rf

In [None]:
# Pair every current Telugu contestant with the random-forest probability
rf_scores = pd.DataFrame(pred_val_rf, columns=['Predicted_Winner'])
winner = pd.concat([BB_telugu_participant, rf_scores], axis=1)
winner[['Name', 'Predicted_Winner']]

## Predicted Winner for Bigg Boss Telugu Season 3, as per Random Forest

In [None]:
# Predicted Winner for Bigg Boss Telugu Season 3, as per RandomForest:
# the name in the row with the highest predicted probability
top_position = winner['Predicted_Winner'].argmax()
winner.iloc[top_position]['Name']

![Rahul](https://upload.wikimedia.org/wikipedia/te/f/f9/Rahul_Sipligunj.jpg)
<img src="https://i0.wp.com/www.newsbugz.com/wp-content/uploads/2018/01/26063636_2015962868641181_6491273729258126947_o.jpg" width="40%">

## Neural Networks (MLP)

In [None]:
# Multi-layer perceptron with library defaults apart from the fixed seed
clf_NN = MLPClassifier(random_state=2019)
# Alternative configuration kept for reference:
#clf_NN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=2019)
clf_NN.fit(x_train, y_train)

In [None]:
# MLP validation probabilities from column 1 of predict_proba (the model's second class)
predictproba_NN = clf_NN.predict_proba(x_val)[:, 1]
# Probabilities above 0.5 are labelled 1, the rest 0
above_cutoff = predictproba_NN > 0.5
y_pred_val = above_cutoff.astype("int").ravel()
print(f1_score(y_val, y_pred_val, average='weighted') * 100)

In [None]:
# Plain accuracy of the thresholded MLP predictions on the validation split
NNAccuracy = accuracy_score(y_val, y_pred_val)
print(NNAccuracy)

In [None]:
# Receiver Operating Characteristic
# Plot theme for the ROC figure; the three positional arguments are passed to sns.set in order.
# NOTE(review): font='Ricty' assumes that font is installed - verify, otherwise matplotlib falls back.
sns.set('talk', 'whitegrid', 'dark', font_scale=1, font='Ricty',rc={"lines.linewidth": 2, 'grid.linestyle': '--'})

def plotAUC(truth, pred, lab):
    """Plot one ROC curve, with its AUC in the legend, on the current matplotlib figure.

    truth -- true labels, forwarded to roc_curve
    pred  -- predicted scores, forwarded to roc_curve
    lab   -- legend text; '(AUC = x.xx)' is appended to it

    The figure is not shown here; the caller is expected to call plt.show().
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(truth, pred)
    area_under_curve = auc(false_pos_rate, true_pos_rate)
    line_width = 2

    # Random RGB triple, so the curve colour differs between calls
    curve_colour = tuple(np.random.rand() for _ in range(3))
    legend_text = lab + '(AUC = %0.2f)' % area_under_curve
    plt.plot(false_pos_rate, true_pos_rate, color=curve_colour, lw=line_width, label=legend_text)

    # Dashed diagonal from (0, 0) to (1, 1) as a reference line
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.legend(loc="lower right")

In [None]:
# ROC curve of the MLP on the validation split
plotAUC(y_val, predictproba_NN, 'MLP')
plt.show()

In [None]:
# Confusion Matrix
# Transposed to match the axis labels set below (rows = predicted, columns = true);
# every column is then divided by its own total.
label_counts = confusion_matrix(y_val, y_pred_val).T
cm = label_counts.astype('float') / label_counts.sum(axis=0)
ax = sns.heatmap(cm, annot=True, cmap='Blues')
ax.set_xlabel('True Label', size=12)
ax.set_ylabel('Predicted Label', size=12)

In [None]:
# Hard class predictions of the MLP for the rows in `test`
Y_pred_test = clf_NN.predict(test)
Y_pred_test