## AML-3204 Social Media Analytics 

## Project Title: Collaborative Filtering-based vs Hybrid Recommender System

### Group Members

- SAI VARUN KOLLIPARA – C0828403
- PRAMOD REDDY GURRALA – C0850493
- DEEKSHA NAIKAP – C0835440
- BHANU PRAKASH MAHADEVUNI – C0850515

### Scraping Function

In [2]:
import requests
import sys
import os
import json
import datetime
import pandas as pd
import snscrape.modules.twitter as sntwitter
import itertools
import codecs
import re
# path is the folder where the CSV files will be stored
path = 'tweets'

class Scrapper:
    """Collect tweets for one keyword, day by day, into a pandas DataFrame.

    The accumulated tweets live in ``self.dataframe`` and can be written to /
    read from a CSV file located in the module-level ``path`` folder.
    """

    def __init__(self, keyword, dates=None, fixed_query='', tweets_to_obtain_per_day=100):
        """Store the search settings and start with an empty DataFrame.

        Args:
            keyword: Search term; it is also used to build the CSV file name.
            dates: Iterable of date-like objects (they must support
                ``+ timedelta`` and ``strftime``). One query is issued per
                entry. Defaults to an empty list.
            fixed_query: Text placed in front of the keyword in every query.
            tweets_to_obtain_per_day: Only used to derive ``self.batches``
                (the number of whole hundreds requested).
        """
        self.keyword = keyword
        # Build a fresh list per instance: a mutable default argument would be
        # shared by every Scrapper created without explicit dates.
        self.dates = [] if dates is None else dates
        self.fixed_query = fixed_query
        self.batches = int(tweets_to_obtain_per_day / 100)
        # Accumulates the tweets collected for the keyword
        self.dataframe = pd.DataFrame()

    def file_name(self):
        """Return the CSV path for this keyword ('@', '#' and '/' removed)."""
        clean_keyword = self.keyword.replace('@', '').replace('#', '').replace('/', '')
        return f'{path}/tweets_{clean_keyword}.csv'

    def check_file_existence(self):
        """Return True when the CSV file for this keyword already exists."""
        return os.path.exists(self.file_name())

    def scrap_tweets(self, from_date):
        """Fetch tweets for the single day starting at ``from_date``.

        Matching tweets (if any) are appended to ``self.dataframe`` with the
        columns id, url, date, content and keyword.
        """
        to_date = from_date + datetime.timedelta(days=1)
        query = f'{self.fixed_query} {self.keyword} since:{from_date.strftime("%Y-%m-%d")} until:{to_date.strftime("%Y-%m-%d")}'
        result = sntwitter.TwitterSearchScraper(query)
        # NOTE(review): at most 50 items are taken per day regardless of
        # tweets_to_obtain_per_day / self.batches -- confirm this cap is intended.
        df = pd.DataFrame(itertools.islice(result.get_items(), 50))
        if len(df) > 0:
            df['keyword'] = self.keyword
            # ignore_index=True gives every accumulated row a unique label.
            # Without it each day's rows are labelled 0..n-1 again, and any
            # later de-duplication on the index would discard them.
            self.dataframe = pd.concat(
                [self.dataframe, df[['id', 'url', 'date', 'content', 'keyword']]],
                ignore_index=True,
            )

    def get_tweets(self):
        """Run ``scrap_tweets`` for every date in ``self.dates``."""
        for from_date in self.dates:
            self.scrap_tweets(from_date)

    def save(self):
        """Write the collected tweets to the keyword's CSV file."""
        target = self.file_name()
        print(f'Saving {target}')
        # Create the destination folder on first use so to_csv does not fail.
        folder = os.path.dirname(target)
        if folder:
            os.makedirs(folder, exist_ok=True)
        self.dataframe.to_csv(target)

    def load(self):
        """Read the keyword's CSV file into ``self.dataframe``.

        An empty file is reported and leaves ``self.dataframe`` unchanged.
        """
        try:
            print(f'Loading {self.file_name()}')
            self.dataframe = pd.read_csv(self.file_name(), engine='python')
        except pd.errors.EmptyDataError:
            print(f'Empty {self.file_name()}')

### Cleaner Function

In [3]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer
import numpy as np
# Module-level Porter stemmer instance
porter = PorterStemmer()
# Set built from the NLTK `words` corpus word list
words = set(nltk.corpus.words.words())
# English stop-word list from NLTK, loaded once; used by Cleaner.clean_data
cachedStopWords = stopwords.words("english")

class Cleaner:
    """Prepare a tweets DataFrame for analysis.

    Holds a DataFrame in ``self.result`` and offers steps to de-duplicate it,
    tokenize/clean the ``content`` column and extract hashtags and usernames.
    """

    # Compiled once for the class instead of on every clean_data call.
    # NOTE(review): the last range (U+24C2..U+1F251) is very wide and also
    # matches many non-emoji characters -- confirm this is intended.
    _EMOJI_PATTERN = re.compile("["
                                u"\U0001F600-\U0001F64F"
                                u"\U0001F900-\U0001F9FF"
                                u"\U0001F000-\U0002FFFF"
                                u"\U0001F300-\U0001F5FF"
                                u"\U0001F680-\U0001F6FF"
                                u"\U0001F1E0-\U0001F1FF"
                                u"\U00002702-\U000027B0"
                                u"\U000024C2-\U0001F251"
                                "]+", flags=re.UNICODE)

    def __init__(self, df, features):
        """Keep the DataFrame to clean and the columns to retain.

        Args:
            df: DataFrame with at least a ``content`` column.
            features: Column selection applied at the end of ``drop_duplicates``.
        """
        self.result = df
        self.features = features

    def get_result(self):
        """Return the current (possibly cleaned) DataFrame."""
        return self.result

    def drop_duplicates(self):
        """Drop rows with a repeated index label and renumber the rows.

        Missing ``content`` values are replaced by the string "0", the index is
        rebuilt as 0..n-1 (named ``index``) and only ``self.features`` is kept.
        The DataFrame passed to the constructor is left untouched.
        """
        # Boolean-mask selection plus copy(): we own the data, so the column
        # assignments below neither warn nor write into the caller's frame.
        df = self.result[~self.result.index.duplicated(keep='first')].copy()
        # Plain assignment instead of chained fillna(inplace=True), which does
        # not update the parent frame under pandas copy-on-write.
        df["content"] = df["content"].fillna("0")
        df['index'] = np.arange(len(df))
        df = df.set_index('index')
        self.result = df[self.features].copy()

    def clean_data(self, text):
        """Return the list of lower-cased, stop-word-free tokens of ``text``.

        Emojis, URLs, digits, one/two-character words, punctuation and
        underscores are removed before tokenizing.
        """
        # Remove emojis
        text = self._EMOJI_PATTERN.sub(r' ', text)
        # Remove all URLs
        text = re.sub(r'http\S+', '', text)
        # Remove numbers
        text = re.sub(r'[0-9]+', ' ', text)
        # Remove all words with a length less than 3
        text = re.sub(r'\b\w{1,2}\b', ' ', text)
        # Remove everything that is not a word character or a space
        text = re.sub(r'[^\w\s]', ' ', text)
        # Lower-case and drop remaining punctuation characters (underscores
        # count as word characters, so they survive the previous step)
        text = "".join(ch.lower() for ch in text if ch not in string.punctuation)
        # Tokenize
        tokens = word_tokenize(text)
        # Remove stop-words
        return [token for token in tokens if token not in cachedStopWords]

    def ex_hashtags(self, text):
        """Return the whitespace-separated parts of ``text`` starting with '#', without the '#'."""
        return [part[1:] for part in text.split() if part.startswith('#')]

    def extract_hashtags(self):
        """Add a ``hashtags`` column holding the hashtags of each ``content``."""
        self.result['hashtags'] = self.result['content'].apply(self.ex_hashtags)

    def ex_usernames(self, text):
        """Return the whitespace-separated parts of ``text`` starting with '@', without the '@'."""
        return [part[1:] for part in text.split() if part.startswith('@')]

    def extract_usernames(self):
        """Add a ``usernames`` column holding the mentions of each ``content``."""
        self.result['usernames'] = self.result['content'].apply(self.ex_usernames)

    def extract_content(self):
        """Add a ``content_clean`` token column and drop near-empty rows."""
        # Create a column with the cleaned tokens
        self.result['content_clean'] = self.result['content'].apply(self.clean_data)
        # Keep only rows with more than one token.
        # NOTE(review): use ``>= 1`` if single-token tweets should be kept.
        # copy() so later column assignments do not write into a slice.
        self.result = self.result[self.result['content_clean'].map(len) > 1].copy()

### Sentiment Analysis on the Tweets

In [4]:
# The following class summarizes the functions to calculate the sentiment with VADER
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Module-level analyzer instance used by SentimentAnalyzer.get_simple_sentiment_score
analyzer = SentimentIntensityAnalyzer()
class SentimentAnalyzer:
    """Score the ``content`` column of a DataFrame with VADER.

    The DataFrame is kept in ``self.result``; ``get_sentiments`` adds the
    columns ``sentiment_score`` and ``sentiment`` to it.
    """

    def __init__(self, df):
        """Keep ``df`` (needs a ``content`` column) as the working DataFrame."""
        self.result = df

    def get_simple_sentiment_score(self, sentence):
        """Return the module-level analyzer's polarity scores for ``sentence``."""
        return analyzer.polarity_scores(sentence)

    def get_sentiments(self):
        """Add the compound score and a positive/negative/neutral label per row."""
        self.result['sentiment_score'] = self.result['content'].apply(
            lambda t: self.get_simple_sentiment_score(t)['compound']
        )
        # VADER's documented thresholds are inclusive: compound >= 0.05 is
        # positive and compound <= -0.05 is negative; anything between is neutral.
        self.result['sentiment'] = self.result['sentiment_score'].apply(
            lambda s: 'positive' if s >= 0.05 else ('negative' if s <= -0.05 else 'neutral')
        )

    def get_result(self):
        """Return the working DataFrame."""
        return self.result

    def mean_sentiment_by_keyword(self):
        """Replace the working DataFrame by the mean ``sentiment_score`` per ``keyword``."""
        self.result = self.result.groupby('keyword', as_index=False)['sentiment_score'].mean()

