**Problem Statement**
This notebook would be used to carry out sentiment analysis on the tweets about organisations. The data would also capture the number of followers and following in numeric datatype. The output from the analysis would be presented as a dataframe containing the list of companies as rows and the average polarity , average subjectivity, number of followers and number of followings as the columns.

**Data**
The data for each company is available as csv and it will be preprocessed accordlingly for NLP processing.

**Tools**
The python libraries to be used are Pandas & TextBlob.


In [1]:
#!pip install TextBlob

In [2]:
# Data Manipulation and cleaning Libraries
import numpy as np
import re
import pandas as pd

# File system libraries
from os import listdir
from os.path import isfile, join

# NLP processing libraries
from textblob import TextBlob

In [3]:
# Add the file names to a list
files = [f for f in listdir("C:\\Users\\VG\OneDrive - Cardiff Metropolitan University\\Document Management\\Dissertation\\Crunchbase Data\\tweets") if isfile(join("C:\\Users\\VG\\OneDrive - Cardiff Metropolitan University\\Document Management\\Dissertation\\Crunchbase Data\\tweets", f))]

In [4]:
# Peep at the list 
files[:4]

['.csv', '1NorthEnlight.csv', '23andMe.csv', '4INFO.csv']

In [5]:
# Read the all the dataset into a dictionary of DataFrames
data = {}

for i in range(0,len(files)+1):
    try:
        df = pd.read_csv("tweets/"+files[i],usecols=['account','text','follower_data','following_data'])
        data[f'{files[i].split(".")[0]}'] = df
    except:
        pass

In [6]:
# Helper function clean tweet text
def clean_tweet(tweet):
    """
    Parameters:
    ----------
    Tweet: String type of the tweet content
    
    Returns:
    ---------
    cleaned version of the string tweet
    
    """
    tweet =tweet.lower()# Lowercasing all the letters
    tweet= re.sub("@[A-Za-z0-9_]+","", tweet) # Remove all the mentions
    tweet = re.sub("#[A-Za-z0-9_]+","", tweet) # Remove all the hashtags
    tweet = re.sub(r"http\S+", "", tweet) # Remove the URL
    tweet = re.sub(r"www.\S+", "", tweet) # Remove the URL
    tweet= re.sub('[()!?]', ' ', tweet) # Remove punctuations
    tweet = re.sub('\[.*?\]',' ', tweet) # Remove punctuations
    tweet= re.sub("[^a-z0-9]"," ", tweet) # Remove all non-alphanumeric characters

    return tweet

# Helper function clean tweet in all DataFrames
def clean_dict(data):
    """
    Parameters:
    ----------
    data: Dictionary of the DataFrames with tweets to clean
    
    Returns:
    ---------
    Dictionary of DataFrame with clean tweets
    
    """

    for key, val in data.items():
        val.dropna(subset=['text'],inplace=True)# Drop NAs
        data[key]=val # Update dictionary

        new_string = [] # create list to store cleaned tweets
        for tweet in val['text']:
            tweet = clean_tweet(tweet) # Clean tweets
            new_string.append(tweet)
        
        val['text']=new_string # update the dataframe with clean tweet
        data[key]=val
    return  data 
        

In [7]:
# Clean the tweet datasets and save in a dictionary
data_c = clean_dict(data)

In [8]:
# Calculate and add the polarity and subjectivity columns
for key, val in data_c.items():
    for item in val['follower_data']:
        if str(item).find('K') != -1:
            item = re.sub("[^0-9.]","", item) # format the K to '000'
            item = float(item)*1000
#             print(item)
            val['follower_data'] = item
            data_c[key]=val
        elif str(item).find('M') != -1:
            item = re.sub("[^0-9.]","", item) # format the M to '000000'
            item = float(item)*1000000
#             print(item)
            val['follower_data'] = item
            data_c[key]=val
    try:        
        val['polarity']=val['text'].apply(lambda x:TextBlob(x).sentiment.polarity)# Calculates and add polarity columns
        val['subjectivity']=val['text'].apply(lambda x:TextBlob(x).sentiment.subjectivity)      
        data_c[key]=val # Update dictionary
    except:
        pass
        


In [9]:
# Check the output
data_c

{'':                                  account  \
 0   https://twitter.com/jellyfishglobal/   
 3   https://twitter.com/jellyfishglobal/   
 5   https://twitter.com/jellyfishglobal/   
 6   https://twitter.com/jellyfishglobal/   
 8   https://twitter.com/jellyfishglobal/   
 9   https://twitter.com/jellyfishglobal/   
 11  https://twitter.com/jellyfishglobal/   
 12  https://twitter.com/jellyfishglobal/   
 15  https://twitter.com/jellyfishglobal/   
 16  https://twitter.com/jellyfishglobal/   
 17  https://twitter.com/jellyfishglobal/   
 18  https://twitter.com/jellyfishglobal/   
 19  https://twitter.com/jellyfishglobal/   
 23  https://twitter.com/jellyfishglobal/   
 25  https://twitter.com/jellyfishglobal/   
 26  https://twitter.com/jellyfishglobal/   
 
                                                  text follower_data  \
 0    if use of the platform is restricted as a res...         6,811   
 3   is our industry s obsession with roas stunting...         6,811   
 5   how can 

In [10]:
# Create single dataframe for all companies

company = [] 
account = []
followers = []
following = []
polarity = []
subjectivity = []

for key, val in data_c.items():
    for item in val['follower_data']:
        followers.append(item)
        break
    for item in val['following_data']:
        following.append(item)
        break
    for item in val['account']:
        account.append(item)
        break
    company.append(key)
    polarity.append(val['polarity'].mean())
    subjectivity.append(val['subjectivity'].mean())

enrich_df = pd.DataFrame(list(zip(company, following, followers, polarity, subjectivity, account)),
                         columns=  ['company_name','following','followers','polarity',
                         'subjectivity','account'])


In [11]:
# View the final dataset
enrich_df

Unnamed: 0,company_name,following,followers,polarity,subjectivity,account
0,,2012,6811,0.036111,0.184722,https://twitter.com/jellyfishglobal/
1,1NorthEnlight,7,5,0.441667,0.705556,http://twitter.com/1NorthEnlight
2,23andMe,6814,92900.0,0.094235,0.351383,http://twitter.com/23andMe
3,4INFO,2061,2109,0.375000,0.750000,http://twitter.com/4INFO
4,A123Systems,185,1844,0.012121,0.284848,http://twitter.com/A123Systems
...,...,...,...,...,...,...
322,zappos,858,60700.0,0.086145,0.263922,http://twitter.com/Zazzle
323,Zazzle,4533,34300.0,0.170833,0.344792,http://twitter.com/Zello
324,Zello,2716,10500.0,-0.012808,0.255536,http://twitter.com/zimbra
325,zimbra,112,91800.0,0.115857,0.330372,http://twitter.com/zoho


In [12]:
# Save the dataset
enrich_df.to_csv('enrichdata.csv',index=False)