In [1]:
!pip install tweepy



#Fetching tweets using Twitter API


In [None]:
# Python Script to Extract tweets of a
# particular Hashtag using Tweepy and Pandas

# import modules
import pandas as pd
import tweepy

# function to display data of each tweet
def printtweetdata(n, ith_tweet):
    """Print one tweet record as a labelled, one-field-per-line listing.

    Args:
        n: Sequence number shown in the "Tweet n:" heading.
        ith_tweet: Indexable record whose positions 0-9 hold, in order,
            username, description, location, following count, follower
            count, total tweets, retweet count, tweet text, hashtags and
            language.
    """
    field_labels = (
        "Username",
        "Description",
        "Location",
        "Following Count",
        "Follower Count",
        "Total Tweets",
        "Retweet Count",
        "Tweet Text",
        "Hashtags Used",
        "Languages Used",
    )

    print()
    print(f"Tweet {n}:")
    # Index explicitly (rather than zip) so that a record that is too short
    # still raises IndexError at the first missing field.
    for position, label in enumerate(field_labels):
        print(f"{label}:{ith_tweet[position]}")


# function to perform data extraction
def scrape(words, date_since, numtweet):
    """Search Twitter for `words`, print every hit and save them all to CSV.

    The function relies on the module-level ``api`` object (the ``tweepy.API``
    client created in the ``__main__`` section) and on ``printtweetdata``.

    Args:
        words: Search query, e.g. a hashtag.
        date_since: Earliest tweet date as ``yyyy-mm-dd``. A falsy value
            disables the date restriction.
        numtweet: Maximum number of tweets to fetch.

    Returns:
        pandas.DataFrame with one row per tweet; the same frame is written to
        ``scraped_tweets.csv`` in the current working directory.
    """
    columns = ['username',
               'description',
               'location',
               'following',
               'followers',
               'totaltweets',
               'retweetcount',
               'text',
               'hashtags',
               'lang']

    # `since_id` expects a numeric tweet ID, so a date must not be passed
    # there. The search endpoint takes a start date through the `since:`
    # query operator instead.
    query = f"{words} since:{date_since}" if date_since else words

    # Tweepy 4 renamed API.search to API.search_tweets; support both.
    search_method = getattr(api, "search_tweets", None) or api.search

    # Cursor handles pagination; .items() caps the number of tweets returned.
    tweets = tweepy.Cursor(search_method,
                           query,
                           tweet_mode='extended').items(numtweet)

    rows = []
    for i, tweet in enumerate(tweets, start=1):
        user = tweet.user

        # A retweet carries the original status in `retweeted_status`; a
        # plain tweet has no such attribute, hence the AttributeError path.
        try:
            text = tweet.retweeted_status.full_text
        except AttributeError:
            text = tweet.full_text

        hashtext = [tag['text'] for tag in tweet.entities['hashtags']]

        ith_tweet = [user.screen_name, user.description,
                     user.location, user.friends_count,
                     user.followers_count, user.statuses_count,
                     tweet.retweet_count, text, hashtext, tweet.lang]
        rows.append(ith_tweet)

        # Echo the tweet on screen as it is collected.
        printtweetdata(i, ith_tweet)

    # Build the frame once; appending row by row through .loc gets slower
    # with every row already stored.
    db = pd.DataFrame(rows, columns=columns)

    filename = 'scraped_tweets.csv'
    db.to_csv(filename)
    return db
        
#return scrape[words, date_since, numtweet].head(10)

if __name__ == '__main__':
    import os

    # Credentials from the Twitter developer account are read from the
    # environment so that real keys never end up saved in the notebook.
    # The " " fallback keeps the original placeholder value.
    consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", " ")
    consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", " ")
    access_key = os.environ.get("TWITTER_ACCESS_KEY", " ")
    access_secret = os.environ.get("TWITTER_ACCESS_SECRET", " ")

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    # `api` must stay a module-level name: scrape() looks it up globally.
    api = tweepy.API(auth, wait_on_rate_limit=True)

    # Hashtag and start date are fixed here; no input is read from the user.
    words = "UkraineRussiaWar"
    print(f"Searching Twitter for: {words}")
    date_since = "2022-03-26"
    print(f"Fetching tweets since {date_since} (yyyy-mm-dd)")
    # Upper bound on the number of tweets to extract in one run.
    numtweet = 10000000
    scrape(words, date_since, numtweet)
    print('Scraping has completed!')

#Importing Libraries, Modules, Utilities, Plotting, NLTK, Sklearn

In [None]:
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from functools import reduce
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud 
import pandas as pd
import re
import string
from pyspark.sql import functions as F
from sklearn.model_selection import train_test_split
# utilities
import re
import pickle
import numpy as np
import pandas as pd

# plotting
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff

# nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Fetch the NLTK data packages, in the same order as before.
for nltk_resource in ('vader_lexicon',
                      'stopwords',
                      'punkt',
                      'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

#Starting Spark Session

In [None]:
# Build (or reuse) a local-mode Spark session named "Sentiment Analysis".
spark_settings = {
    "spark.driver.memory": '1g',
    'spark.network.timeout': '3601s',
    'spark.executor.heartbeatInterval': '3600s',
}

session_builder = SparkSession.builder.master("local[*]")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.appName("Sentiment Analysis").getOrCreate()

#Reading Fetched Tweets File

In [None]:
# The file was written by pandas' to_csv, which escapes a quote inside a
# quoted field by doubling it (""). Spark's CSV reader expects a backslash
# unless told otherwise, so quote/escape are set explicitly; without this,
# tweets containing quotes can be split into the wrong columns.
df = (
    spark.read.format("csv")
    .option("delimiter", ",")
    .option("multiline", True)
    .option("header", True)
    .option("inferSchema", True)
    .option("ignoreTrailingWhiteSpace", True)
    .option("quote", '"')
    .option("escape", '"')
    .option("unescapedQuoteHandling", "STOP_AT_CLOSING_QUOTE")
    .load("/content/scraped_tweets.csv")
)

#Data Cleaning/ Filtering

In [None]:
# Keep only the rows whose language column is "en".
is_english = df["lang"] == "en"
df_en = df.where(is_english)

In [None]:
# Preview the English-only rows.
df_en.show()

In [None]:
# Columns that the sentiment pipeline does not use.
unused_columns = [
    "description",
    "following",
    "followers",
    "totaltweets",
    "retweetcount",
    "hashtags",
]
df2 = df_en.drop(*unused_columns)
df2.printSchema()

#Data Preprocessing
 ## Removing Links
 ## Removing Numeric Values
 ## Removing Emoticons
 ## Removing Symbols
 ## Removing Blank Spaces
 

In [None]:
def preprocessing(sparkDF,col):
    """Clean a text column of a Spark DataFrame with a fixed series of regex passes.

    Args:
        sparkDF: Spark DataFrame holding the column to clean.
        col: Name of the text column; it is overwritten with the cleaned text.

    Returns:
        A new DataFrame in which `col` has links, @mentions, #hashtags, the
        retweet marker, digits and all other non-alphanumeric characters
        removed, runs of spaces collapsed and outer whitespace trimmed.
    """
    # Order matters: links, mentions and hashtags must go before the generic
    # non-alphanumeric pass destroys the characters that identify them.
    cleaning_steps = [
        (r'http\S+', ''),           # links
        (r'@\w+', ''),              # mentions
        (r'#\w+', ''),              # hashtags
        # Word boundaries keep the letters "RT" inside ordinary words
        # (e.g. "SMART", "ARTILLERY") from being deleted.
        (r'\bRT\b', ''),            # retweet marker
        (r':', ''),                 # colons are dropped without leaving a space
        (r'[^A-Za-z0-9]+', ' '),    # symbols, emoji, punctuation -> one space
        (r'[0-9]+', ''),            # numeric values
        # No separate hyphen pass: the non-alphanumeric step above has
        # already turned every "-" into a space.
        (r'[ ]+', ' '),             # collapse repeated spaces
    ]

    for pattern, replacement in cleaning_steps:
        sparkDF = sparkDF.withColumn(col, F.regexp_replace(col, pattern, replacement))

    return sparkDF.withColumn(col, F.trim(sparkDF[col]))

In [None]:
# Clean the tweet text; df2 is rebound to the frame holding the cleaned 'text' column.
df2 = preprocessing(df2,'text')

In [None]:
# Preview the first 20 rows after preprocessing.
df2.show(20)

#Select only the “text” column, since it is all we need to extract user sentiment, and convert the DataFrame into an RDD (well suited to processing unstructured data).

In [None]:
# Each Row holds a single field, so unwrap it to get an RDD of plain text values.
text_rows = df2.select("text").rdd
Tweets_rdd = text_rows.map(lambda row: row[0])

In [None]:
# The CSV was read with header=True, so this RDD contains tweet text only.
# Treating the first element as a header would delete the first tweet and
# every tweet with identical text. Only null or empty texts (for example a
# tweet that consisted solely of a link) are removed here, which also keeps
# .lower() from failing on None.
data_rmv_col = Tweets_rdd.filter(lambda row: isinstance(row, str) and row != "")

lowerCase_sentRDD = data_rmv_col.map(lambda x : x.lower())

#Now split each tweet into sentences, also called sentence tokenization.

In [None]:
def sent_TokenizeFunct(x):
    """Split one text string into a list of sentences using NLTK."""
    sentences = nltk.sent_tokenize(x)
    return sentences

# One list of sentences per tweet.
sentenceTokenizeRDD = lowerCase_sentRDD.map(sent_TokenizeFunct)

#Now split each sentence into words, also called word tokenization.

In [None]:
def word_TokenizeFunct(x):
    """Flatten a list of sentences into one list of whitespace-separated words."""
    words = []
    for sentence in x:
        words.extend(sentence.split())
    return words
# One flat list of words per tweet.
wordTokenizeRDD = sentenceTokenizeRDD.map(word_TokenizeFunct)
# Preview the first 10 tokenized tweets.
wordTokenizeRDD.take(10)

#Before moving ahead we clean the data by removing stopwords, punctuation, and empty tokens.

In [None]:
def removeStopWordsFunct(x):
    """Return the tokens of `x` that are not in NLTK's English stop-word list."""
    from nltk.corpus import stopwords
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in x:
        if token not in english_stops:
            kept_tokens.append(token)
    return kept_tokens

# Stop-word-free token list per tweet.
stopwordRDD = wordTokenizeRDD.map(removeStopWordsFunct)
def removePunctuationsFunct(x):
    """Strip ASCII punctuation from every token and drop tokens left empty.

    Args:
        x: Iterable of string tokens.

    Returns:
        List of the tokens with all `string.punctuation` characters removed;
        tokens that become empty are omitted.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    stripped_tokens = (token.translate(strip_table) for token in x)
    # Return the filtered list. The original built it but returned the
    # unfiltered one, so empty tokens were kept.
    return [token for token in stripped_tokens if token]
# Apply punctuation stripping to every tweet's token list.
rmvPunctRDD = stopwordRDD.map(removePunctuationsFunct)

# Preview the first 10 cleaned token lists.
rmvPunctRDD.take(10)

#Stemming and Lemmatization are the basic text processing methods for English text. The goal of both of them is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form. I have skipped Stemming because it is not an efficient method as sometimes it produces words that are not even close to the actual word.

In [None]:
def lemmatizationFunct(x):
    """Lemmatize every token in `x` with NLTK's WordNetLemmatizer.

    Args:
        x: Iterable of string tokens.

    Returns:
        List with the lemma of each token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    try:
        return [lemmatizer.lemmatize(s) for s in x]
    except LookupError:
        # The WordNet corpus is missing: fetch it and retry. Downloading only
        # on this path avoids calling nltk.download() for every record.
        nltk.download('wordnet', quiet=True)
        return [lemmatizer.lemmatize(s) for s in x]
lem_wordsRDD = rmvPunctRDD.map(lemmatizationFunct)

lem_wordsRDD.take(10)

#Our next task is a little tricky, we have to extract keyphrases(also called Noun phrases). So first we need to join “lem_wordsRDD” tokens.

In [None]:
def joinTokensFunct(x):
    """Join tokens with single spaces and wrap the phrase in a one-element list.

    Args:
        x: Iterable of string tokens.

    Returns:
        A list containing exactly one string: the tokens joined by " ".
    """
    return [" ".join(x)]
# One single-element list (the re-joined phrase) per tweet.
joinedTokens = lem_wordsRDD.map(joinTokensFunct)
# Preview the first 10 joined phrases.
joinedTokens.take(10)

#From the above step we roughly got all the key phrases the users are talking about. Now categorize these key phrases into Positive, Negative, or Neutral.

In [None]:
def sentimentWordsFunct(x):
    """Label every phrase in `x` as Negative, Neutral or Positive using VADER.

    Args:
        x: Iterable of phrases. Each item is passed through ``''.join`` first,
            so a plain string is used as is.

    Returns:
        List of ``(phrase, label)`` tuples, where label is "Negative" for a
        compound score below 0, "Neutral" for exactly 0 and "Positive"
        otherwise.
    """
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()

    sentiment_list = []
    for i in x:
        phrase = ''.join(i)
        # Read the compound score directly instead of scanning the whole
        # score dict for the one key of interest.
        compound = analyzer.polarity_scores(phrase)['compound']
        if compound < 0.0:
            label = "Negative"
        elif compound == 0.0:
            label = "Neutral"
        else:
            label = "Positive"
        sentiment_list.append((phrase, label))
    return sentiment_list

sentimentRDD = joinedTokens.flatMap(sentimentWordsFunct)

In [None]:
# Preview the first 10 (phrase, label) pairs.
sentimentRDD.take(10)

In [None]:
# Turn the (phrase, label) tuples into a Spark DataFrame; later cells refer to
# its two columns as _1 (text) and _2 (sentiment label).
data = sentimentRDD.toDF()
data.printSchema()

#Sentiment Analyzed

In [None]:
# Preview the first 10 labelled tweets.
data.show(10)

In [None]:
# Report the frame's shape as a (rows, columns) tuple.
row_count = data.count()
column_count = len(data.columns)
print((row_count, column_count))

In [None]:
# Register the frame under the name "sentiments" so it can be queried with spark.sql.
data.createOrReplaceTempView("sentiments")

In [None]:
# Count the tweets per sentiment label (column _2).
spark.sql("select _2 as Sentiments, count(_2) as Counts from sentiments group by _2").show()

In [None]:
# Convert the Spark frame to pandas and write it to tweet_df.csv (relative path).
data.toPandas().to_csv("tweet_df.csv")

In [None]:
# Reload the exported sentiment table with pandas.
# NOTE(review): the file was written to the relative path "tweet_df.csv"; this
# absolute path presumably matches the notebook's working directory - confirm.
tweets_df = pd.read_csv("/content/tweet_df.csv")

#Data Visualization

In [None]:
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff

In [None]:
# Bar chart of the number of tweets per sentiment label.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(x="_2", data=tweets_df, palette='viridis', ax=ax)

In [None]:
# One frame per sentiment label, each re-indexed from 0.
sentiment_labels = tweets_df['_2']
tweet_neg = tweets_df[sentiment_labels == 'Negative'].reset_index(drop=True)
tweet_net = tweets_df[sentiment_labels == 'Neutral'].reset_index(drop=True)
tweet_pos = tweets_df[sentiment_labels == 'Positive'].reset_index(drop=True)

#EDA Visualization for Negative Words

In [None]:
stopwords_set = set(STOPWORDS)

# str(Series) is only the truncated display repr (index numbers, "...",
# "Name: _1, dtype: object"), so the cloud was built from a handful of rows
# plus that metadata. Join the actual tweet texts instead; missing texts
# (NaN) are skipped.
negative_text = " ".join(tweet_neg['_1'].dropna().astype(str))

wordcloud = WordCloud(background_color='black',
                      stopwords=stopwords_set,
                      max_words=300,
                      max_font_size=40,
                      scale=2,
                      random_state=42
                      ).generate(negative_text)

print(wordcloud)
plt.figure(figsize = (10,7))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

#EDA Visualization for Neutral Words

In [None]:
stopwords_set = set(STOPWORDS)

# str(Series) is only the truncated display repr (index numbers, "...",
# "Name: _1, dtype: object"), so the cloud was built from a handful of rows
# plus that metadata. Join the actual tweet texts instead; missing texts
# (NaN) are skipped.
neutral_text = " ".join(tweet_net['_1'].dropna().astype(str))

wordcloud = WordCloud(background_color='black',
                      stopwords=stopwords_set,
                      max_words=300,
                      max_font_size=40,
                      scale=2,
                      random_state=42
                      ).generate(neutral_text)

print(wordcloud)
plt.figure(figsize = (10,7))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

#EDA Visualization for Positive Words

In [None]:
stopwords_set = set(STOPWORDS)

# str(Series) is only the truncated display repr (index numbers, "...",
# "Name: _1, dtype: object"), so the cloud was built from a handful of rows
# plus that metadata. Join the actual tweet texts instead; missing texts
# (NaN) are skipped.
positive_text = " ".join(tweet_pos['_1'].dropna().astype(str))

wordcloud = WordCloud(background_color='black',
                      stopwords=stopwords_set,
                      max_words=300,
                      max_font_size=40,
                      scale=2,
                      random_state=42
                      ).generate(positive_text)

print(wordcloud)
plt.figure(figsize = (10,7))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Cast the tweet text column to str before it is used as model input.
tweets_df['_1'] = tweets_df['_1'].astype('str')

In [None]:
# Cast the sentiment label column to str as well.
tweets_df['_2'] = tweets_df['_2'].astype('str')

In [None]:
# Model input (cleaned tweet text) and target (sentiment label).
processed_txt = tweets_df['_1']
sentiments = tweets_df['_2']

#Splitting the Data

In [None]:
# Hold out 30% of the tweets for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    processed_txt,
    sentiments,
    test_size=0.3,
    random_state=0,
)
print('Data Split done.')

#Vectorization 

In [None]:
# TF-IDF over unigrams and bigrams, fitted on the training texts only.
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
print('Vectoriser fitted.')
# get_feature_names() no longer exists in current scikit-learn releases; the
# fitted vocabulary_ mapping has one entry per feature in every version.
print('No. of feature_words: ', len(vectoriser.vocabulary_))

#Data Transformation

In [None]:
# Replace the raw texts of both splits with their TF-IDF matrices.
X_train, X_test = (vectoriser.transform(texts) for texts in (X_train, X_test))
print('Data Transformed.')

#Creating Data Model For Training & Testing Data

In [None]:
# Train logistic regression on the training split.
# The pasted estimator repr that followed was removed: it built a throw-away
# object and passed multi_class='warn', which scikit-learn releases without
# that parameter would reject.
LRmodel = LogisticRegression(solver='liblinear', random_state=0)
LRmodel.fit(X_train, y_train)

In [None]:
# Fit on the TRAINING split. Fitting on X_test/y_test and then scoring on the
# same data reports training accuracy instead of held-out accuracy.
LRmodel = LogisticRegression(solver='liblinear', random_state=0).fit(X_train, y_train)

In [None]:
# Class probabilities predicted for every test sample.
LRmodel.predict_proba(X_test)

#Data Model Accuracy

In [None]:
# Predict once and reuse the result for both reports.
y_pred = LRmodel.predict(X_test)
report_text = classification_report(y_test, y_pred)
confusion = metrics.confusion_matrix(y_test, y_pred)
print('Classification Report: \n', report_text)
print('Confusion Matrix: \n', confusion)