In [17]:
# connect colab to google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.util import mark_negation
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# Part 1

Ref: - https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment/

### Load and process data

In [19]:
# Load Twitter Airline Sentiment dataset
df = pd.read_csv('/content/drive/MyDrive/News Datasets/Shreya_Zope_Tweets_dataset.csv')

# Step 2: Data Preprocessing
df['text'] = df['text'].str.lower()  # convert to lower case

# Step 3: Stop Words Removal
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: ' '.join(word for word in word_tokenize(x) if word not in stop_words))


### Split train test

In [20]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['airline_sentiment'],
                                                    test_size=0.2, random_state=42)


### Training function

In [21]:
def train_and_test_cross_val_k_fold(X, y, X_test, y_test):

    # Define the K-fold Cross Validator
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # K-fold Cross Validation model evaluation
    fold_no = 1
    test_preds = []
    for train, val in kfold.split(X, y):

        nb_classifier = MultinomialNB()

        # Generate a print
        print(f'Training for fold {fold_no} ...\n')

        # Fit data to model
        nb_classifier.fit(X[train], y.iloc[train])

        # Generate a prediction of the test set
        test_preds.append(nb_classifier.predict(X_test))

        # Increase fold number
        fold_no = fold_no + 1

    # Transpose the list of predictions
    test_preds = np.array(test_preds).T

    # Use majority voting to get the final prediction
    final_preds = [Counter(x).most_common(1)[0][0] for x in test_preds]

    # Now you can calculate precision, recall, f1-score
    precision = precision_score(y_test, final_preds, average='weighted')
    recall = recall_score(y_test, final_preds, average='weighted')
    f1 = f1_score(y_test, final_preds, average='weighted')

    print('Test Precision:', precision)
    print('Test Recall:', recall)
    print('Test F1 Score:', f1)


### Experiment 1: Unigram Features


In [22]:
# Experiment 1: Unigram Features
vectorizer_nb = CountVectorizer(max_features=1000)
X_train_unigram = vectorizer_nb.fit_transform(X_train)
X_test_unigram = vectorizer_nb.transform(X_test)

# Model Training and Cross-Validation
nb_classifier = MultinomialNB()
scoring = ['precision_weighted', 'recall_weighted', 'f1_weighted']
scores = cross_validate(nb_classifier, X_train_unigram, y_train, cv=5, scoring=scoring)

print('Unigram Features:')
print('Cross-Validation Precision:', scores['test_precision_weighted'])
print('Cross-Validation Recall:', scores['test_recall_weighted'])
print('Cross-Validation F1 Score:', scores['test_f1_weighted'])



Unigram Features:
Cross-Validation Precision: [0.73701886 0.74750793 0.74137439 0.75419228 0.73233046]
Cross-Validation Recall: [0.74434486 0.75373453 0.74978651 0.76131512 0.73654996]
Cross-Validation F1 Score: [0.73989769 0.74972848 0.74452567 0.75648724 0.73406306]


In [23]:
train_and_test_cross_val_k_fold(X_train_unigram, y_train, X_test_unigram, y_test)

Training for fold 1 ...

Training for fold 2 ...

Training for fold 3 ...

Training for fold 4 ...

Training for fold 5 ...

Test Precision: 0.7586979143793433
Test Recall: 0.7636612021857924
Test F1 Score: 0.7608362316463679


### Experiment 2: Bigram Features


In [24]:
vectorizer_nb = CountVectorizer(ngram_range=(1, 2), max_features=1000)
X_train_bigram = vectorizer_nb.fit_transform(X_train)
X_test_bigram = vectorizer_nb.transform(X_test)

# # Model Training and Cross-Validation
# classifier = MultinomialNB()
# scoring = ['precision_weighted', 'recall_weighted', 'f1_weighted']
# scores = cross_validate(classifier, X_train_bigram, y_train, cv=5, scoring=scoring)

# print('Bigram Features:')
# print('Cross-Validation Precision:', scores['test_precision_weighted'])
# print('Cross-Validation Recall:', scores['test_recall_weighted'])
# print('Cross-Validation F1 Score:', scores['test_f1_weighted'])



In [25]:
train_and_test_cross_val_k_fold(X_train_bigram, y_train, X_test_bigram, y_test)

Training for fold 1 ...

Training for fold 2 ...

Training for fold 3 ...

Training for fold 4 ...

Training for fold 5 ...

Test Precision: 0.7516114442746965
Test Recall: 0.7561475409836066
Test F1 Score: 0.7536181000147473


### Model building v2 - Feature exploration

### Experiment 3: Negation Handling

In [26]:
X_train_negation = X_train.apply(lambda x: ' '.join(mark_negation(x.split())))
X_test_negation = X_test.apply(lambda x: ' '.join(mark_negation(x.split())))

vectorizer_nb = CountVectorizer(max_features=1000)
X_train_negation = vectorizer_nb.fit_transform(X_train_negation)
X_test_negation = vectorizer_nb.transform(X_test_negation)

# # Model Training and Evaluation
# classifier = MultinomialNB()
# classifier.fit(X_train_negation, y_train)
# y_pred = classifier.predict(X_test_negation)
# print('\nNegation Handling:')
# print('Precision:', precision_score(y_test, y_pred, average='weighted'))
# print('Recall:', recall_score(y_test, y_pred, average='weighted'))
# print('F1 Score:', f1_score(y_test, y_pred, average='weighted'))


In [27]:
train_and_test_cross_val_k_fold(X_train_negation, y_train, X_test_negation, y_test)

Training for fold 1 ...

Training for fold 2 ...

Training for fold 3 ...

Training for fold 4 ...

Training for fold 5 ...

Test Precision: 0.754288171315225
Test Recall: 0.7595628415300546
Test F1 Score: 0.7565653717048993


### Experiment 4: Sentiment Lexicon

In [28]:
sid = SentimentIntensityAnalyzer()
df['sentiment_scores'] = df['text'].apply(lambda x: sid.polarity_scores(x)['compound'])

X_train_lex, X_test_lex, y_train_lex, y_test_lex = train_test_split(df[['text', 'sentiment_scores']], df['airline_sentiment'], test_size=0.2, random_state=42)

vectorizer = CountVectorizer(max_features=1000)
X_train_lexicon = vectorizer.fit_transform(X_train_lex['text'])
X_test_lexicon = vectorizer.transform(X_test_lex['text'])

# Add sentiment scores as additional features
X_train_lexicon = pd.concat([pd.DataFrame(X_train_lexicon.toarray()), X_train_lex['sentiment_scores'].reset_index(drop=True)], axis=1)
X_train_lexicon.columns = X_train_lexicon.columns.astype(str)

X_test_lexicon = pd.concat([pd.DataFrame(X_test_lexicon.toarray()), X_test_lex['sentiment_scores'].reset_index(drop=True)], axis=1)
X_test_lexicon.columns = X_test_lexicon.columns.astype(str)

# Model Training and Evaluation
classifier = LogisticRegression()

classifier.fit(X_train_lexicon, y_train_lex)
y_pred = classifier.predict(X_test_lexicon)
print('\nSentiment Lexicon:')
print('Precision:', precision_score(y_test_lex, y_pred, average='weighted'))
print('Recall:', recall_score(y_test_lex, y_pred, average='weighted'))
print('F1 Score:', f1_score(y_test_lex, y_pred, average='weighted'))



Sentiment Lexicon:
Precision: 0.7917596578065299
Recall: 0.7957650273224044
F1 Score: 0.7934839059655487


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
# Highest scores obtained using lexicon features!

### Final Naive Bayes Model

In [30]:
# Unigram Features
vectorizer_nb = CountVectorizer(max_features=1000)
X_train_unigram_final = vectorizer_nb.fit_transform(df['text'])
y_train_final = df['airline_sentiment']

# Model Training and Cross-Validation
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_unigram_final, y_train_final)

# Part 2

In [32]:
# Load the Fake and Real news datasets
df_fake = pd.read_csv('/content/drive/MyDrive/News Datasets/Fake.csv').head(50)
df_real = pd.read_csv('/content/drive/MyDrive/News Datasets/True.csv').head(50)

# Preprocess the text
df_fake['processed_text'] = df_fake['text'].str.lower()
df_real['processed_text'] = df_real['text'].str.lower()

df_fake['processed_text'] = df_fake['processed_text'].apply(lambda x: ' '.join(word for word in word_tokenize(x) if word not in stop_words))
df_real['processed_text'] = df_real['processed_text'].apply(lambda x: ' '.join(word for word in word_tokenize(x) if word not in stop_words))


In [33]:
# Function to count positive and negative sentences in a text
def count_sentiments_lexicon_model(text):
    sentences_df = pd.DataFrame(sent_tokenize(text), columns=['text'])
    sentences_vec = vectorizer.transform(sentences_df['text'])
    sentences_df['sentiment_scores'] = sentences_df['text'].apply(lambda x: sid.polarity_scores(x)['compound'])

    test_lexicon = pd.concat([pd.DataFrame(sentences_vec.toarray()), sentences_df['sentiment_scores'].reset_index(drop=True)], axis=1)
    test_lexicon.columns = test_lexicon.columns.astype(str)
    preds = classifier.predict(test_lexicon)

    num_positive = np.sum(preds == 'positive')
    num_negative = np.sum(preds == 'negative')
    return pd.Series([num_positive, num_negative])

def count_sentiments_naive_bayes(text):
    sentences_df = pd.DataFrame(sent_tokenize(text), columns=['text'])
    sentences_vec = vectorizer_nb.transform(sentences_df['text'])
    preds = nb_classifier.predict(sentences_vec)
    num_positive = np.sum(preds == 'positive')
    num_negative = np.sum(preds == 'negative')
    return pd.Series([num_positive, num_negative])


In [34]:
# Apply the function to the Fake and Real news articles
df_fake[['num_positive', 'num_negative']] = df_fake['processed_text'].apply(count_sentiments_lexicon_model)
df_real[['num_positive', 'num_negative']] = df_real['processed_text'].apply(count_sentiments_lexicon_model)

# Save the results to csv files
df_fake[['text', 'num_positive', 'num_negative']].to_csv('fake_results.csv', index=False)
df_real[['text', 'num_positive', 'num_negative']].to_csv('real_results.csv', index=False)


In [35]:
# Calculate the average number of positive and negative sentences
avg_num_positive_fake = df_fake['num_positive'].mean()
avg_num_negative_fake = df_fake['num_negative'].mean()

print('Average number of positive sentences in Fake news articles:', avg_num_positive_fake)
print('Average number of negative sentences in Fake news articles:', avg_num_negative_fake)

if avg_num_positive_fake > avg_num_negative_fake:
    print('Fake news articles tend to contain more positive sentences.')
elif avg_num_positive_fake < avg_num_negative_fake:
    print('Fake news articles tend to contain more negative sentences.')
else:
    print('Fake news articles contain an equal number of positive and negative sentences.')


Average number of positive sentences in Fake news articles: 1.3
Average number of negative sentences in Fake news articles: 9.4
Fake news articles tend to contain more negative sentences.


# Discussion

In [36]:
df_fake.head(20)

Unnamed: 0,title,text,subject,date,processed_text,num_positive,num_negative
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",donald trump wish americans happy new year lea...,12,11
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",house intelligence committee chairman devin nu...,2,6
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017","friday , revealed former milwaukee sheriff dav...",0,20
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017","christmas day , donald trump announced would b...",0,11
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",pope francis used annual christmas day message...,1,11
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017",number cases cops brutalizing killing people c...,0,10
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,"December 23, 2017",donald trump spent good portion day golf club ...,1,10
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017",wake yet another court decision derailed donal...,0,7
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017",many people raised alarm regarding fact donald...,1,8
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,"December 21, 2017",might thought get break watching people kiss d...,2,7


In [37]:
df_real.head(20)

Unnamed: 0,title,text,subject,date,processed_text,num_positive,num_negative
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",washington ( reuters ) - head conservative rep...,1,15
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",washington ( reuters ) - transgender people al...,1,12
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",washington ( reuters ) - special counsel inves...,3,7
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",washington ( reuters ) - trump campaign advise...,1,10
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",seattle/washington ( reuters ) - president don...,2,23
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017","west palm beach , fla./washington ( reuters ) ...",1,11
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017","west palm beach , fla ( reuters ) - president ...",1,17
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017",following statements posted verified twitter a...,1,1
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017",following statements posted verified twitter a...,1,1
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017",washington ( reuters ) - alabama secretary sta...,0,2


### Features and Experiment

- **Unigram Features**: The initial experiment was conducted using unigram features, which are individual words. The CountVectorizer function was utilized to transform the text data into a matrix of token counts. The model was subsequently trained and evaluated using 5-fold cross-validation, providing a baseline for comparison with more complex feature sets.

- **Bigram Features**: The second experiment expanded the feature set to include bigram features, which are pairs of consecutive words. This was achieved by adjusting the ngram_range parameter of the CountVectorizer function to (1, 2), thereby including both unigrams and bigrams. The model was trained and evaluated using the same 5-fold cross-validation, allowing for a direct comparison of performance with the unigram model.

- **Negation Handling**: The third experiment incorporated negation handling into the feature extraction process. The nltk.sentiment.util.mark_negation function was used to append a "_NEG" suffix to words appearing within the scope of a negation and a punctuation mark. This allowed the model to better capture the semantic implications of negations in the text. The model was then trained and evaluated using 5-fold cross-validation.

- **Sentiment Lexicon**: The fourth experiment involved the use of a sentiment lexicon as an additional feature. The nltk.sentiment.vader.SentimentIntensityAnalyzer was used to compute sentiment scores for each text, which were then added as additional features to the feature matrix. A Logistic Regression model was trained and evaluated on this augmented data, providing a comparison of performance with the Naive Bayes models.

- **Final Naive Bayes Model**: The final model was trained using unigram features and the Multinomial Naive Bayes classifier. This model was then used to predict the sentiment of sentences in Fake and Real news articles, providing a practical application of the model.

- **Fake and Real News Analysis**: The final part of the assignment involved the analysis of Fake and Real news articles. The sentiment classifier was used to count the number of positive and negative sentences in each article. The average number of positive and negative sentences was then calculated for both Fake and Real news articles, providing insights into the sentiment distribution in these two types of news.

### Q: Explain how you examine whether the Fake content tends to contain more positive or negative sentences

- For each article in the Fake news dataset:
  - Tokenize the article into individual sentences.
  - Use the sentiment classifier to predict the sentiment of each sentence.
  - Count the number of sentences that were classified as 'positive' and 'negative'.

- After processing all articles in the Fake news dataset:
  - Calculate the average number of positive sentences across all Fake news articles.
  - Calculate the average number of negative sentences across all Fake news articles.

- Compare the average number of positive and negative sentences:
  - If the average number of positive sentences is greater than the average number of negative sentences, conclude that Fake news articles tend to contain more positive sentences.


### My takeaways

- **Feature Selection**: Learned that choosing the right features, like unigrams, bigrams, or sentiment scores, can really impact how well our model performs.

- **Model Selection**: Realized that different models work better with different features. For example, Naive Bayes was great with unigrams and bigrams, but Logistic Regression did better when we added sentiment scores.

- **Cross-Validation**: Got hands-on experience with cross-validation. It's a solid way to check how our model might do on unseen data.

- **Real-world Application**: Applying the model to real and fake news articles was cool. It showed how sentiment analysis can be used in real-life situations.

- **Reflections on Results**: The results were eye-opening. The sentiment lexicon model did better than the unigram and bigram models, but there's still room for improvement.

- **Reflections on Methodology**: The whole process of trying different features and models, checking how they did, and tweaking things was a great learning experience. It showed me that building a machine learning model is really an iterative process.