In [20]:
#import packages
#pip install textblob
#pip install keras
#pip install tensorflow
import time
from textblob import TextBlob
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import keras.optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [4]:
#df_09= pd.read_pickle(r"C:\Users\danie\Documents\GitHub\Masters-Thesis\2009_preprocessed_date.pickle") 
df_09= pd.read_pickle(r"/Users/yolandaferreirofranchi/Desktop/2009_text_wo_names.pickle")

In [5]:
def absolute_count(male_col, female_col):
    if female_col > male_col and male_col == 0:
        return 1
    elif male_col> female_col and female_col ==0: 
        return 0
    else: 
        return None

#PP Data
df_09['col_type'] = df_09.apply(lambda row: absolute_count(row['male_count'], row['female_count']),axis=1)
df_09= df_09[df_09["col_type"].notnull()]
df_09

Unnamed: 0,pre_processed_sent,male_count,female_count,Proper_noun_list,pn exists,sentences,article_id,year,col_type
9,"[leave, band, follow, bust, say, simply, could...",3,0,"[Noel, Gallagher]",True,Noel Gallagher left the Manchester band follow...,5048,2009,0.0
10,"[launch, clothing, line, earlier, year, admit,...",2,0,[Liam],,"""Liam launched his clothing line Pretty Green ...",5048,2009,0.0
11,"[thinking, next, step, musically, mind, say]",1,0,[],,"""I'm thinking of what the next step is musical...",5048,2009,0.0
13,"[people, able, buy, record]",1,0,[],,"""People will be able to buy his records.",5048,2009,0.0
33,"[come, back, hang, fellow, cast, member, mitch...",0,2,[Sam],,She's coming back as Sam Mitchell and was hang...,8981,2009,1.0
...,...,...,...,...,...,...,...,...,...
19327,"[report, speculate, mime, part, track, want, p...",0,2,[],,Some reports have speculated that she mimed pa...,2157826,2009,1.0
19328,"[know, dance, lot]",0,1,[],,"""I know she was dancing a lot.",2157826,2009,1.0
19329,"[know, mime, think, really, great, performance...",0,2,[],,I don't know if she was miming or not but I th...,2157826,2009,1.0
19330,"[legend, make, comeback, show, rendition, new,...",2,0,"[Pop, Robbie]",True,Pop legend Robbie Williams made his comeback o...,2157826,2009,0.0


In [6]:
def subjectivity(sentence):
    subjectivity = ""

    subjectivity = TextBlob(sentence).sentiment.subjectivity

    return subjectivity

def polarity(sentence):
    polarity = ""

    polarity = TextBlob(sentence).sentiment.polarity

    return polarity

In [7]:
df_09['subjectivity'] = df_09['sentences'].apply(subjectivity)
df_09['polarity'] = df_09['sentences'].apply(polarity)

In [46]:
df_09["subjectivity"].describe()

count    5342.000000
mean        0.357059
std         0.302850
min         0.000000
25%         0.000000
50%         0.350000
75%         0.550000
max         1.000000
Name: subjectivity, dtype: float64

In [48]:
#create a 3 point criteria from -1 to 1 (range of polarity)
def map_sentiment(value):
    if value <= -0.33: 
        return -1
    elif value >= 0.33:
        return 1
    else:
        return 0


df_09['sentiment'] = df_09['polarity'].apply(map_sentiment)
df_09['sentiment'] = df_09["sentiment"].astype(float)
df_09["sentiment"].value_counts()

 0.0    4188
 1.0     923
-1.0     231
Name: sentiment, dtype: int64

**RNN LSTM Model for Sentiment Analysis** 

In [40]:
def rnn_lstm(df, sentences_col, sentiment_col):
    #start timer 
    start_time = time.time()

    X = df[sentences_col]
    y = df[sentiment_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2, random_state = 42)
    
    # Tokenize the data
    tokenizer = Tokenizer(num_words=1000)
    tokenizer.fit_on_texts(X_train)
    
    # Convert the texts to sequences
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)
    
    # Pad the sequences to ensure equal length
    maxlen = 100
    X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
    X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
    
    # Define the LSTM model
    model_lstm = Sequential()
    model_lstm.add(Embedding(input_dim=1000, output_dim=64, input_length=maxlen))
    model_lstm.add(LSTM(64))
    model_lstm.add(Dense(1, activation='sigmoid'))
    model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', "mean_squared_error"])
    
    # Train the model
    model_lstm.fit(X_train, y_train, epochs=1, batch_size=32, validation_data=(X_test, y_test))
    
    # Evaluate the model
    score = model_lstm.evaluate(X_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    #add early stopping 
    earlystop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

    # Make predictions on the padded sequences
    y_pred = model_lstm.predict(X_test)
    
    #end timer 
    end_time = time.time()
    print(f"\nExecution time: {end_time - start_time:.2f} seconds")

    # print the average sentiment for a word based on the words found in the corpus of the model
    #(add code here)
    return X_test, y_pred

    

In [41]:
rnn_lstm(df_09, "sentences", "sentiment")

Test loss: 0.4599429666996002
Test accuracy: 0.8278765082359314


(array([[  5,   9,   1, ...,   0,   0,   0],
        [ 74,   8,  54, ...,   0,   0,   0],
        [  8, 256, 714, ...,   0,   0,   0],
        ...,
        [ 15, 474,  12, ...,   0,   0,   0],
        [123,   2,  27, ...,   0,   0,   0],
        [  1,  36,  77, ...,   0,   0,   0]], dtype=int32),
 array([[0.84174556],
        [0.8417457 ],
        [0.8417453 ],
        ...,
        [0.84174544],
        [0.8417454 ],
        [0.84174514]], dtype=float32))

**CNN Model for Sentiment Analysis (WE WILL NOT USE, SIMPLY FOR REFERENCE TO ANSWER DEFENSEN QUESTIONS)**

In [68]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Define X and y
X = df_09['sentences']
y = df_09['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Convert the texts to sequences
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad the sequences to ensure equal length
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Define the CNN model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=maxlen))
model.add(Conv1D(128, 3, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', "mean_squared_error"])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: -334.3150634765625
Test accuracy: 0.8661863207817078


**Table Comparing Model Results**

In [70]:
results = {' ': ['LSTM', 'CNN'],
           'Accuracy': [0.795, 0.866],
           'Mean Squared Error': [0.193, 0.1339],
           'Test loss': [0.3425, -334.315]}

# Create a pandas dataframe from the dictionary
df = pd.DataFrame(results)

# Set the index of the dataframe to the Kernel column
df.set_index(' ', inplace=True)

# Display the dataframe
print(df)

      Accuracy  Mean Squared Error  Test loss
                                             
LSTM     0.795              0.1930     0.3425
CNN      0.866              0.1339  -334.3150
