In [46]:
import twitter_news_scraper as tns
import text_normalizer as tn
import extractor as ex
import os
import pandas as pd

from datetime import datetime, date
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

# --- Search parameters used to find tweets that link to news articles ---
# A fixed end date keeps the collected data set stable between runs; swap in
# the commented line below to collect up to the current day instead.
# to_date = date.today()
to_date = datetime(2022, 6, 1)

# One-year look-back window ending at to_date.
# NOTE: this raises ValueError if to_date is ever 29 February, because that
# day does not exist in the previous year.
from_date = datetime(to_date.year - 1, to_date.month, to_date.day)

# Search query text (adjacent literals are concatenated into a single string).
query = (
    '"sports news" (cricket OR hockey OR football OR rugger OR tennis OR badminton OR volleyball OR '
    'race OR racing OR swimming OR gymnastics OR diving OR polo) lang:en'
)

# --- File locations (all relative to the current working directory) ---
# os.path.join picks the right separator for the host OS; a hard-coded '\\'
# only produces a valid path on Windows.

# csv file where news items are saved
output_file = os.path.join(os.getcwd(), 'news_items.csv')

# deduplicated news items
dedup_file = os.path.join(os.getcwd(), 'dedup_items.csv')

# normalized train and test news items
train_norm_file = os.path.join(os.getcwd(), 'train_normalized.csv')
test_norm_file = os.path.join(os.getcwd(), 'test_normalized.csv')

In [3]:
# Search for tweets that carry news, using the date window and query from the
# configuration cell. This step can be skipped: the gathered data is already
# stored in the CSV file and is loaded in a later cell.
# NOTE(review): the meaning of the two trailing integers (20000, 15000) is
# defined inside tns.find_tweets_with_news - presumably limits; TODO confirm.
news_tweets = tns.find_tweets_with_news(from_date, to_date, query, 20000, 15000)

News items found : 5


In [3]:
# Extract the news text behind the links found in the tweets. This step can be
# skipped: the gathered data is already stored in the CSV file.
# NOTE(review): the return value is discarded, so the helper presumably
# updates news_tweets in place - TODO confirm against tns.extract_news.
tns.extract_news(news_tweets)

In [10]:
# Optional: persist all news tweets and their news text to output_file
# (without the index column). This has already been done once, so the cell
# only needs to run after gathering fresh data.
news_tweets.to_csv(output_file, index=False)

In [3]:
# Recommended entry point: the data has already been gathered and saved, so
# loading the CSV into a DataFrame is much quicker than re-running the
# scraping cells.
news_tweets = pd.read_csv(output_file)
# Show which file was loaded.
print(output_file)

D:\project\assignment\Text_Analytics\twitter-nlp\news_items.csv


In [5]:
# Report the distribution of sentences, words and unique words per news item.
# The placeholder column is created up front so the unique words can be kept
# for later processing (presumably filled in by describe_news_statistics -
# TODO confirm).
news_tweets["unique_words"] = None

# These tweet-level columns are not used from here on. errors='ignore' keeps
# the cell re-runnable: without it a second run raises KeyError because the
# columns were already removed by the first run.
news_tweets = news_tweets.drop(columns=['date', 'user', 'tweet'], errors='ignore')
news_tweets = tns.describe_news_statistics(news_tweets)

Sentences [0-100] Words [0-200] Unique Words [0-100] : 3654
Sentences [0-100] Words [0-200] Unique Words [100-200] : 8
Sentences [0-100] Words [200-400] Unique Words [0-100] : 1
Sentences [0-100] Words [200-400] Unique Words [100-200] : 26
Sentences [0-100] Words [200-400] Unique Words [200-300] : 39
Sentences [0-100] Words [400-600] Unique Words [100-200] : 8
Sentences [0-100] Words [400-600] Unique Words [200-300] : 150
Sentences [0-100] Words [400-600] Unique Words [300-400] : 41
Sentences [0-100] Words [600-800] Unique Words [100-200] : 4
Sentences [0-100] Words [600-800] Unique Words [200-300] : 17
Sentences [0-100] Words [600-800] Unique Words [300-400] : 199
Sentences [0-100] Words [600-800] Unique Words [400-500] : 42
Sentences [0-100] Words [600-800] Unique Words [500-600] : 1
Sentences [0-100] Words [800-1000] Unique Words [100-200] : 2
Sentences [0-100] Words [800-1000] Unique Words [200-300] : 3
Sentences [0-100] Words [800-1000] Unique Words [300-400] : 55
Sentences [0-100

In [2]:
# Clean the gathered news data by finding and removing duplicate items.
# The result can optionally be saved to / reloaded from dedup_items.csv in the
# current directory using the commented-out lines below.

# Remove duplicate news items. Per the original note, the helper's default
# match threshold is 80%; a stricter 95% threshold is passed here.
# NOTE(review): exact semantics of the threshold live in tns.clean_duplicate_news.
dedup_df = tns.clean_duplicate_news(news_tweets, 95)

# Optional: save the deduplicated items (the file has already been written once).
# dedup_df.to_csv(dedup_file, index=False)

# Alternative: load the previously saved deduplicated dataset instead of recomputing it.
# dedup_df = pd.read_csv(dedup_file)

# Report sentence / word / unique-word distribution stats for the deduplicated data.
dedup_df = tns.describe_news_statistics(dedup_df)

Sentences [0-100] Words [0-200] Unique Words [0-100] : 50
Sentences [0-100] Words [0-200] Unique Words [100-200] : 8
Sentences [0-100] Words [200-400] Unique Words [100-200] : 22
Sentences [0-100] Words [200-400] Unique Words [200-300] : 30
Sentences [0-100] Words [400-600] Unique Words [100-200] : 6
Sentences [0-100] Words [400-600] Unique Words [200-300] : 103
Sentences [0-100] Words [400-600] Unique Words [300-400] : 29
Sentences [0-100] Words [600-800] Unique Words [100-200] : 4
Sentences [0-100] Words [600-800] Unique Words [200-300] : 12
Sentences [0-100] Words [600-800] Unique Words [300-400] : 113
Sentences [0-100] Words [600-800] Unique Words [400-500] : 25
Sentences [0-100] Words [600-800] Unique Words [500-600] : 1
Sentences [0-100] Words [800-1000] Unique Words [100-200] : 1
Sentences [0-100] Words [800-1000] Unique Words [200-300] : 3
Sentences [0-100] Words [800-1000] Unique Words [300-400] : 23
Sentences [0-100] Words [800-1000] Unique Words [400-500] : 103
Sentences [0-

In [6]:
# Hold out 300 randomly chosen news items for testing; the rest is used for training.
# random_state pins the draw so the same split is produced on every run -
# without it each execution yields a different test set and metrics cannot be
# compared between runs.
test_df = dedup_df.drop(columns=['unique_words']).sample(300, random_state=42)
test_df["normalized"] = None

# training dataset: every row whose index was not drawn into the test set
train_df = dedup_df.drop(index=test_df.index, columns=['unique_words'])
train_df["normalized"] = None

# Pre-processing for feature extraction, with the results saved to CSV for
# quick reference. Left commented out because the normalized files already
# exist and are loaded in the following cell.
# train_df = tn.normalize_text(train_df, True, True)
# train_df = train_df.drop(columns=['news_text'])
# train_df.to_csv(train_norm_file, index=False)

# test_df = tn.normalize_text(test_df, True, True)
# test_df = test_df.drop(columns=['news_text'])
# test_df.to_csv(test_norm_file, index=False)

In [47]:
# Load the previously normalized train / test news items.
train_df = pd.read_csv(train_norm_file)
test_df = pd.read_csv(test_norm_file)

# Fit the bag-of-words extractor on the training text. astype('U') turns every
# value (including NaN read from empty CSV cells) into a unicode string.
vectorizer, features = ex.bag_of_words_extractor(train_df["normalized"].values.astype('U'))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when the vectorizer offers it and fall back
# to the old method otherwise. .tolist() keeps feature_names a plain list of
# str, as the old method returned.
# NOTE(review): assumes the vectorizer returned by ex.bag_of_words_extractor
# is a scikit-learn vectorizer - confirm.
feature_names_getter = getattr(vectorizer, "get_feature_names_out", None)
if feature_names_getter is not None:
    feature_names = feature_names_getter().tolist()
else:
    feature_names = vectorizer.get_feature_names()
feature_matrix = features.todense()

# applying the same (already fitted) features on the test dataset
test_features = vectorizer.transform(test_df["normalized"].values.astype('U'))
test_matrix = test_features.todense()

# summarizes the feature distribution by feature ranges of 100
ex.feature_summary(feature_matrix, feature_names, 100)

News items per feature ranges distributions in 100s
 --------------------------------------------------
Features 0-100 : 45 news items
Features 100-200 : 34 news items
Features 200-300 : 93 news items
Features 300-400 : 88 news items
Features 400-500 : 128 news items
Features 500-600 : 116 news items
Features 600-700 : 113 news items
Features 700-800 : 105 news items
Features 800-900 : 69 news items
Features 900-1000 : 41 news items
Features 1000-1100 : 39 news items
Features 1100-1200 : 26 news items
Features 1200-1300 : 21 news items
Features 1300-1400 : 19 news items
Features 1400-1500 : 16 news items
Features 1500-1600 : 13 news items
Features 1600-1700 : 10 news items
Features 1700-1800 : 4 news items
Features 1800-1900 : 3 news items
Features 1900-2000 : 10 news items
Features 2000-2100 : 4 news items
Features 2100-2200 : 2 news items
Features 2200-2300 : 1 news items
Features 2300-2400 : 2 news items
Features 2600-2700 : 2 news items
Features 3000-3100 : 1 news items
Features 32

In [48]:
# Question f. Build sport-category-wise features used to label the train dataset.

# Keywords that are looked for among the extracted feature names.
# NOTE(review): "balling" may be a typo for "bowling" - confirm before changing,
# since the literal is matched against feature names at runtime.
categories = ["cricket", "bowl", "batsman", "batting", "balling", "wicket", "runs", "goalkeeper", "hockey", 
              "football", "soccer", "rugger", "rugby", "tennis", "badminton", "volleyball", 
              "race", "racing", "swimming", "gymnastics", "diving", "polo", "baseball", 
              "chess", "shooting", "basketball", "golf", "catch", "boundary", "fifa", "freestyle"]

# Maps a main sport to the keywords from `categories` that belong to it
# (e.g. "soccer" and "fifa" are grouped under "football").
related_categories = {"football": ["soccer", "goalkeeper", "fifa"], "rugby": ["rugger"], "race": ["racing"], 
                      "cricket": ["bowl", "batsman", "batting", "wicket", "runs", "catch", "boundary", "balling"], 
                     "swimming": ["freestyle"]}

# sport-wise features derived from the bag-of-words feature names
category_features = ex.classify_category_features(feature_names, categories, related_categories)

# generate category labels for the train dataset
# NOTE(review): how labels are derived is defined in
# ex.data_naivebayes_classify_categories - not visible from this notebook.
train_labels = ex.data_naivebayes_classify_categories(category_features, train_df)

In [49]:
# Question g. Predict the categories of the test data.

# Linear SVM trained with stochastic gradient descent (hinge loss).
# SGD shuffles the training data between epochs, so random_state is fixed to
# make the reported metrics reproducible from run to run.
classifier = SGDClassifier(loss='hinge', max_iter=200, random_state=42)

# Train on the train features / labels and predict labels for the test dataset.
predictions = ex.train_predict_classification(classifier, features, train_labels, test_features)
test_df["predicted"] = predictions

# Print prediction statistics against the test set's "category" column.
ex.get_prediction_metrics_accuracy(test_df["category"], predictions)

Accuracy: 0.63
Precision: 0.67
Recall: 0.63
F1 Score: 0.63


In [50]:
# Question h. Compare several scikit-learn algorithms on the same features.

# Each entry pairs the heading printed before the metrics with the estimator
# to evaluate. Order matters: the headings after the first start with a blank
# line, and `classifier` / `predictions` end up holding the last entry's values.
experiments = (
    ("Multinomial NB Performance\n----------------------------", MultinomialNB()),
    ("\nSVM SVC Performance\n----------------------------", svm.SVC()),
    ("\nKNN Classifier Performance\n----------------------------", KNeighborsClassifier(n_neighbors=3)),
)

for heading, classifier in experiments:
    # Train on the train set, predict the test set, then report the metrics.
    predictions = ex.train_predict_classification(classifier, features, train_labels, test_features)
    print(heading)
    ex.get_prediction_metrics_accuracy(test_df["category"], predictions)

Multinomial NB Performance
----------------------------
Accuracy: 0.63
Precision: 0.71
Recall: 0.63
F1 Score: 0.61

SVM SVC Performance
----------------------------
Accuracy: 0.51
Precision: 0.69
Recall: 0.51
F1 Score: 0.43

KNN Classifier Performance
----------------------------
Accuracy: 0.51
Precision: 0.61
Recall: 0.51
F1 Score: 0.51


In [51]:
# Question i. Exploring Keras/Tensorflow library

# import all libraries
import numpy as np
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D

# Count the occurrences of unique words in the training data; the number of
# distinct words sizes both the tokenizer vocabulary and the embedding layer.
counter = ex.counter_word(train_df)
max_length = 3700  # every sequence is padded / truncated to this many tokens
num_words = len(counter)

train_corpus = train_df["normalized"].values.astype('U')
test_corpus = test_df["normalized"].values.astype('U')

# Create a tokenizer and build the word index from the training text only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_corpus)
word_index = tokenizer.word_index

# Build padded integer sequences for the train data.
train_sequences = tokenizer.texts_to_sequences(train_corpus)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")

# Build padded integer sequences for the test data (same tokenizer, same length).
test_sequences = tokenizer.texts_to_sequences(test_corpus)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")

# One-hot encode the labels. The test labels are re-indexed onto the train
# label columns so that both arrays have the same width and column i means the
# same category in each. Encoding the two independently gives mismatched
# columns whenever the sets of categories differ, which breaks or silently
# corrupts the evaluation. A test category never seen in training becomes an
# all-zero row.
train_dummies = pd.get_dummies(train_labels)
test_dummies = pd.get_dummies(test_df["category"]).reindex(columns=train_dummies.columns, fill_value=0)
train_y = train_dummies.values.astype("float32")
test_y = test_dummies.values.astype("float32")

# Output width follows the labels actually present instead of a hard-coded 17.
num_classes = train_y.shape[1]

# Build the model: embedding -> spatial dropout -> LSTM -> softmax over categories.
model = Sequential()
model.add(Embedding(num_words, 32, input_length=max_length))
model.add(SpatialDropout1D(0.1))
model.add(LSTM(64, dropout=0.1, recurrent_dropout=0.1))
model.add(Dense(num_classes, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train for 5 epochs, holding out 10% of the training data for validation.
history = model.fit(train_padded, train_y, epochs=5, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [54]:
# Evaluate the trained model on the held-out test data. The model was compiled
# with a single "accuracy" metric, so the result holds the loss followed by
# the accuracy.
test_matrix = model.evaluate(test_padded, test_y)
test_loss = test_matrix[0]
test_accuracy = test_matrix[1]

report_template = 'Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'
print(report_template.format(test_loss, test_accuracy))

Test set
  Loss: 1.992
  Accuracy: 0.405
