In [73]:
import pandas as pd

In [158]:
# reading data in csv file using pandasb
df = pd.read_csv('filtered_tweets.csv')

In [61]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
1,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
2,0,1467813579,Mon Apr 06 22:20:31 PDT 2009,NO_QUERY,starkissed,@LettyA ahh ive always wanted to see rent lov...
3,0,1467818603,Mon Apr 06 22:21:49 PDT 2009,NO_QUERY,kennypham,"Sad, sad, sad. I don't know why but I hate thi..."
4,0,1467819650,Mon Apr 06 22:22:05 PDT 2009,NO_QUERY,antzpantz,@Viennah Yay! I'm happy for you with your job!...


In [37]:
# Finding out the top tweeters 
number_tweets_df = df.groupby('user').text.count().reset_index()
number_tweets_df.columns = ['user', 'number of tweets']

number_tweets_df.sort_values(by=['number of tweets'], ascending=False, inplace=True)
number_tweets_df.head()

Unnamed: 0,user,number of tweets
4129,lost_dog,549
5530,webwoke,345
5424,tweetpet,310
1767,SallytheShizzle,281
2204,VioletsCRUK,279


In [178]:
# Remove mentions, emails and links
def removeAt(tweet):
    words = tweet.split(' ')
    return ' '.join(filter(lambda x: x and '@' not in x, words))

def removeLinks(tweet):
    words = tweet.split(' ')
    return ' '.join(filter(lambda x: not ('http://' in x.lower() or 'https://' in x.lower()), words))

In [179]:
# Apply formulas 
df.text = df.text.apply(lambda x: removeAt(removeLinks(x.strip())))

In [188]:
# Array of users that we removed (due to duplicates)
duplicated = df.duplicated(subset='text', keep=False)
pd.unique(df['user'])

array(['12gaBrowningGal', '15Stepz', '16_MileyCyrus', ..., 'zubinsaxena',
       'zuppalizzle', 'zzzValzzz'], dtype=object)

In [185]:
# Cleaned dataframe 
df.drop_duplicates(subset='text', keep=False, inplace=True)

# sort dataframe by user 
df.sort_values(by='user', inplace=True)
df.head()

Unnamed: 0,target,id,date,flag,user,text
14524,0,1835705341,Mon May 18 06:34:45 PDT 2009,NO_QUERY,12gaBrowningGal,Blind. Definitely. They aren't the smartest bi...
68041,0,2211076285,Wed Jun 17 12:24:07 PDT 2009,NO_QUERY,12gaBrowningGal,"I'm killing me too! I'm hungry now- Pie, Outdo..."
85046,0,2299986642,Tue Jun 23 13:35:33 PDT 2009,NO_QUERY,12gaBrowningGal,My skeet game bites! I am not joking. I will h...
203184,4,2071364168,Sun Jun 07 19:16:31 PDT 2009,NO_QUERY,12gaBrowningGal,That's just because she trying to figure out w...
207475,4,2178187758,Mon Jun 15 07:19:40 PDT 2009,NO_QUERY,12gaBrowningGal,I am really not skilled enough to shoot trap. ...


In [228]:
# Create a dictionary consolidating users and tweets to send to LIWC 
user_list = df['user'].tolist()
tweets_list = df['text'].tolist()

user_tweet_dic = {}

for i in range(len(user_list)):
    user = user_list[i]
    tweet = tweets_list[i]
    
    if user not in user_tweet_dic:
        user_tweet_dic[user] = []
        user_tweet_dic[user].append(tweet)
    else:
        user_tweet_dic[user].append(tweet)
        
new_user_list = []
new_combined_tweets_list = []

for user, tweets in user_tweet_dic.items():
    combined_tweets = ' '.join(tweets)
    if len(combined_tweets.split(' ')) >= 100:
        new_user_list.append(user)
        new_combined_tweets_list.append(combined_tweets)
    
consolidated_tweets_df = pd.DataFrame({'user': new_user_list, 'tweets': new_combined_tweets_list})
consolidated_tweets_df.to_csv('consolidated_tweets_df.csv')

In [230]:
# Creating a dataframe for the first 500 users 
first_500_users = consolidated_tweets_df['user'].unique()[:500]
first_500_users_df = df[df['user'].isin(first_500_users)]

# Creating a dictionary of user and their tweets 
user_tweet_dictionary = {}

# Create a format that follows IBM Watson's format and saving it to the user_tweet_dictionary
for index, row in first_500_users_df.iterrows():
    user = row['user']
    text = row['text']
    
    tweet = {}
    
    tweet['content'] = text
    tweet['contenttype'] = 'text/plain'
    tweet['language'] = 'en'
    
    if user not in user_tweet_dictionary:
        user_tweet_dictionary[user] = {'contentItems': []}
        user_tweet_dictionary[user]['contentItems'].append(tweet)
    else:
        user_tweet_dictionary[user]['contentItems'].append(tweet)

In [231]:
import json
from ibm_watson import PersonalityInsightsV3
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

#API KEY provided on the service page 
KEY = 'STximBrq6oOabT5yZteafZcbt3m6Xh6SUSVuWr3RW3V6'

# Authentication via IBM's IAM (Identity and Access Management)
authenticator = IAMAuthenticator(KEY)

# Creating a service instance
service = PersonalityInsightsV3(
    version='2017-10-13',
    authenticator=authenticator)

# Setting service endpoint 
service.set_service_url('https://gateway.watsonplatform.net/personality-insights/api')

In [232]:
# Creating a dictionary to store results from IBM Watson
results = {'User': [],
          'Openness': [],
          'Conscientiousness': [],
          'Extraversion': [],
          'Agreeableness': [],
          'Emotional range': []}

# creates profiles of users in user_tweet_dictionary and saves them to results
for user, tweets in user_tweet_dictionary.items():
    profile = service.profile(tweets, 'application/json', raw_scores=True, consumption_preferences=True).get_result()
    
    results['User'].append(user)
    results['Openness'].append(profile['personality'][0]['raw_score'])
    results['Conscientiousness'].append(profile['personality'][1]['raw_score'])
    results['Extraversion'].append(profile['personality'][2]['raw_score'])
    results['Agreeableness'].append(profile['personality'][3]['raw_score'])
    results['Emotional range'].append(profile['personality'][4]['raw_score'])

In [235]:
# Create a dataframe from results dictionary
big5_df = pd.DataFrame(results)
big5_df.head(10)

Unnamed: 0,User,Openness,Conscientiousness,Extraversion,Agreeableness,Emotional range
0,12gaBrowningGal,0.764999,0.656144,0.548956,0.788804,0.439579
1,15Stepz,0.748124,0.690716,0.564859,0.82393,0.41077
2,16_MileyCyrus,0.672814,0.637773,0.525612,0.811025,0.563704
3,18percentgrey,0.789412,0.607883,0.528168,0.772518,0.461528
4,19fischi75,0.706904,0.627109,0.447471,0.803232,0.518279
5,1ChazD,0.718638,0.681279,0.571098,0.788971,0.484606
6,1azylizzie,0.714031,0.553882,0.468429,0.742733,0.656068
7,1flyharmony,0.702556,0.624573,0.535226,0.79688,0.588853
8,24cotton,0.755732,0.632184,0.594373,0.817001,0.56729
9,2NiteBoy,0.693501,0.625447,0.531674,0.764474,0.432516


In [248]:
LIWC_df.head()

Unnamed: 0,User,WC,Analytic,Clout,Authentic,Tone,WPS,Sixltr,Dic,function,...,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP
1,12gaBrowningGal,388,27.36,34.94,68.96,93.82,5.97,10.05,87.89,51.29,...,2.58,0.0,0.0,0.52,12.37,0.52,0.0,3.61,0.0,0.0
2,15Stepz,254,26.07,70.94,84.8,99.0,31.75,11.81,93.7,49.61,...,9.45,0.39,0.0,1.18,0.39,1.57,0.0,4.33,1.57,0.0
3,16_MileyCyrus,1164,21.04,64.42,58.8,99.0,11.09,8.68,86.6,52.32,...,3.26,0.52,0.0,2.15,13.57,0.43,0.0,1.37,0.26,1.03
4,18percentgrey,931,47.25,53.87,58.83,99.0,8.1,15.15,81.1,47.26,...,4.83,0.64,0.75,1.61,2.26,1.93,0.0,3.22,1.29,2.26
5,19fischi75,1707,48.43,48.83,63.95,88.95,37.93,9.67,83.36,47.63,...,0.7,0.23,0.0,3.1,1.64,3.87,0.0,0.0,2.58,3.81


In [246]:
LIWC_df = pd.read_csv('LIWC2015 Results (consolidated_tweets_df.csv).csv')
LIWC_df.drop(0, inplace=True)
LIWC_df.drop(columns=['A', 'C'], inplace=True)
LIWC_df.rename(columns={'B': 'User'}, inplace=True)

updated_df = pd.merge(big5_df, LIWC_df)
updated_df.head()

Unnamed: 0,User,Openness,Conscientiousness,Extraversion,Agreeableness,Emotional range,WC,Analytic,Clout,Authentic,...,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP
0,12gaBrowningGal,0.764999,0.656144,0.548956,0.788804,0.439579,388,27.36,34.94,68.96,...,2.58,0.0,0.0,0.52,12.37,0.52,0.0,3.61,0.0,0.0
1,15Stepz,0.748124,0.690716,0.564859,0.82393,0.41077,254,26.07,70.94,84.8,...,9.45,0.39,0.0,1.18,0.39,1.57,0.0,4.33,1.57,0.0
2,16_MileyCyrus,0.672814,0.637773,0.525612,0.811025,0.563704,1164,21.04,64.42,58.8,...,3.26,0.52,0.0,2.15,13.57,0.43,0.0,1.37,0.26,1.03
3,18percentgrey,0.789412,0.607883,0.528168,0.772518,0.461528,931,47.25,53.87,58.83,...,4.83,0.64,0.75,1.61,2.26,1.93,0.0,3.22,1.29,2.26
4,19fischi75,0.706904,0.627109,0.447471,0.803232,0.518279,1707,48.43,48.83,63.95,...,0.7,0.23,0.0,3.1,1.64,3.87,0.0,0.0,2.58,3.81
