In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
# Module-level min-max scaler configured to rescale each feature into the [0, 1] range.
# NOTE(review): not referenced by any of the cells visible here — confirm it is still needed.
scaler = MinMaxScaler(feature_range=(0, 1))


def train_classifier(features_train, features_test, label_train, label_test, classifier):
    """Fit the estimator named by ``classifier``, pickle it, and print its test score.

    Parameters
    ----------
    features_train, features_test
        Feature matrices handed to ``fit`` and ``score`` respectively.
    label_train, label_test
        Targets that go with the two feature matrices.
    classifier : str
        One of ``"Logistic_Regression"``, ``"Naive_Bayes"``, ``"SVM"``,
        ``"Linear"``, ``"Random_Forest"`` or ``"Kmeans"``.
        Despite its name, ``"Kmeans"`` builds a k-nearest-neighbours *regressor*
        wrapped in a 5-fold grid search over ``n_neighbors`` 2..9.

    Returns
    -------
    The fitted estimator (the ``GridSearchCV`` wrapper for ``"Kmeans"``).

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names. Previously this
        case only printed a message and then crashed on an unbound ``model``.

    Side effects
    ------------
    Writes ``./Sentiment_models/<classifier>.pickle`` (creating the directory
    when it is missing) and prints progress messages.
    """
    import os  # local import: `os` is not imported before this cell in the notebook

    if classifier == "Logistic_Regression":
        model = LogisticRegression(C=1.)
    elif classifier == "Naive_Bayes":
        model = MultinomialNB()
    elif classifier == "SVM":
        model = SVC()
    elif classifier == "Linear":
        model = LinearRegression()
    elif classifier == "Random_Forest":
        model = RandomForestClassifier(n_estimators=400, random_state=11)
    elif classifier == "Kmeans":
        knn = neighbors.KNeighborsRegressor()
        params = {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9]}
        model = GridSearchCV(knn, params, cv=5)
    else:
        # Fail fast with a meaningful error instead of falling through to
        # `model.fit` with `model` undefined.
        raise ValueError("Incorrect Selection Of Classifier: %r" % (classifier,))

    model.fit(features_train, label_train)
    print("Model Fitting Done")

    fileName = './Sentiment_models/' + classifier + '.pickle'
    # Make sure the output directory exists so `open` cannot fail on a fresh checkout.
    os.makedirs(os.path.dirname(fileName), exist_ok=True)
    with open(fileName, 'wb') as file:
        pickle.dump(model, file)
    print("Pickle File Created %s" % fileName)

    # `score` is mean accuracy for the classifiers but R^2 for the regressors
    # ("Linear", "Kmeans"); the printed label is kept unchanged for compatibility.
    accuracy = model.score(features_test, label_test)
    print("Accuracy Is:", accuracy)

    return model

In [3]:
#ID,Title,Air_Date,Production_Code,Season,Episode_No.,Total_Episodes_Till_Now,US_Viewers_In_Millions,Views,IMDB_Rating,IMDB_Votes,Image_URL,Video_URL,Retweets,Favorites,Vader_Score,Sentiment_Score,Tweets_Per_Day,Unique_Users

fileName = "simpsons_episodes.csv"
my_df = pd.read_csv(fileName)

print("File read")
# Remove the identifier, ordering and URL columns in a single call.
my_df = my_df.drop(
    ['ID', 'Title', 'Air_Date', 'Production_Code', 'Season', 'Episode_No.',
     'Total_Episodes_Till_Now', 'Image_URL', 'Video_URL'],
    axis=1,
)
print(my_df.head())
# Plain alias, not a copy: both names refer to the same DataFrame object.
my_df_temp = my_df
     

File read
   US_Viewers_In_Millions    Views  IMDB_Rating  IMDB_Votes  Retweets  \
0                 8650000  36227.0          6.8       481.0       0.0   
1                14620000  40194.0          6.7       552.0       0.0   
2                 5110000  40854.0          7.1       532.0       0.0   
3                 5870000  44945.0          6.8       525.0       0.0   
4                 6080000  41059.0          6.6       496.0       0.0   

   Favorites  Vader_Score  Sentiment_Score  Tweets_Per_Day  Unique_Users  
0        0.0       0.0000              0.0        0.000000             0  
1        0.0       0.1806             28.0       26.285714           143  
2        0.0       0.0000              0.0       39.333333           566  
3        0.0       0.0000              0.0       23.785714           278  
4        0.0       0.0000              0.0       24.714286           139  


In [4]:
# Drop every row that contains at least one missing value, mutating the frame in place.
# NOTE(review): my_df_temp was bound to this same object in the loading cell, so it is
# modified as well — confirm that is intended.
my_df.dropna(inplace=True)
# Renumber the index from 0 so the removed rows leave no gaps; the old index is discarded.
my_df.reset_index(drop=True,inplace=True)
# Print column dtypes and non-null counts for the cleaned frame.
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 10 columns):
US_Viewers_In_Millions    146 non-null int64
Views                     146 non-null float64
IMDB_Rating               146 non-null float64
IMDB_Votes                146 non-null float64
Retweets                  146 non-null float64
Favorites                 146 non-null float64
Vader_Score               146 non-null float64
Sentiment_Score           146 non-null float64
Tweets_Per_Day            146 non-null float64
Unique_Users              146 non-null int64
dtypes: float64(8), int64(2)
memory usage: 11.5 KB


In [5]:
# x = my_df_temp.drop('US_Viewers_In_Millions', axis=1)  
# y = my_df['US_Viewers_In_Millions']

# Predictor columns used as model inputs.
X = my_df[['Views', 'IMDB_Rating', 'IMDB_Votes', 'Retweets', 'Favorites',
           'Vader_Score', 'Sentiment_Score', 'Tweets_Per_Day', 'Unique_Users']]

# Target, selected with a list so it stays a one-column DataFrame rather than a Series.
y = my_df[['US_Viewers_In_Millions']]

print(X.head())
print(y.head())


     Views  IMDB_Rating  IMDB_Votes  Retweets  Favorites  Vader_Score  \
0  36227.0          6.8       481.0       0.0        0.0       0.0000   
1  40194.0          6.7       552.0       0.0        0.0       0.1806   
2  40854.0          7.1       532.0       0.0        0.0       0.0000   
3  44945.0          6.8       525.0       0.0        0.0       0.0000   
4  41059.0          6.6       496.0       0.0        0.0       0.0000   

   Sentiment_Score  Tweets_Per_Day  Unique_Users  
0              0.0        0.000000             0  
1             28.0       26.285714           143  
2              0.0       39.333333           566  
3              0.0       23.785714           278  
4              0.0       24.714286           139  
   US_Viewers_In_Millions
0                 8650000
1                14620000
2                 5110000
3                 5870000
4                 6080000


In [6]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

# Report the dimensions of each partition.
for label, part in (
    ("Shape of X_train: ", X_train),
    ("Shape of y_train: ", y_train),
    ("Shape of X_test: ", X_test),
    ("Shape of y_test", y_test),
):
    print(label, part.shape)

Shape of X_train:  (131, 9)
Shape of y_train:  (131, 1)
Shape of X_test:  (15, 9)
Shape of y_test (15, 1)


In [7]:
print("Model Training Started")
# Name of the estimator to build; it also determines the pickle file name.
algorithm = "Linear"
model = train_classifier(
    features_train=X_train,
    features_test=X_test,
    label_train=y_train,
    label_test=y_test,
    classifier=algorithm,
)
print("Model Training Complete")


Model Training Started
Model Fitting Done
Pickle File Created ./Sentiment_models/Linear.pickle
Accuracy Is: 0.18724254715046196
Model Training Complete


In [1]:
import os

# TF_CPP_MIN_LOG_LEVEL is only guaranteed to be honoured by TensorFlow's native
# logging when it is already in the environment at the time TensorFlow is first
# imported in this process, so it is set before the import below.
# NOTE(review): if TensorFlow was already imported earlier in the same kernel,
# this assignment comes too late to have any effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "99"

import tensorflow as tf


In [2]:
# Test with a simple computation
import tensorflow as tf
