In [164]:
import pandas as pd

In [165]:
# reading data in csv file using pandas
df = pd.read_csv('filtered_tweets.csv')

In [166]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
1,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
2,0,1467813579,Mon Apr 06 22:20:31 PDT 2009,NO_QUERY,starkissed,@LettyA ahh ive always wanted to see rent lov...
3,0,1467818603,Mon Apr 06 22:21:49 PDT 2009,NO_QUERY,kennypham,"Sad, sad, sad. I don't know why but I hate thi..."
4,0,1467819650,Mon Apr 06 22:22:05 PDT 2009,NO_QUERY,antzpantz,@Viennah Yay! I'm happy for you with your job!...


In [167]:
# Remove mentions, emails and links
def removeAt(tweet):
    words = tweet.split(' ')
    return ' '.join(filter(lambda x: x and '@' not in x, words))

def removeLinks(tweet):
    words = tweet.split(' ')
    return ' '.join(filter(lambda x: not ('http://' in x.lower() or 'https://' in x.lower()), words))

In [168]:
# Apply formulas to clean data
df.text = df.text.apply(lambda x: removeAt(removeLinks(x.strip())))

In [169]:
# Cleaned dataframe 
df.drop_duplicates(subset='text', keep=False, inplace=True)

# sort dataframe by user 
df.sort_values(by='user', inplace=True)
df.head()

Unnamed: 0,target,id,date,flag,user,text
14524,0,1835705341,Mon May 18 06:34:45 PDT 2009,NO_QUERY,12gaBrowningGal,Blind. Definitely. They aren't the smartest bi...
68041,0,2211076285,Wed Jun 17 12:24:07 PDT 2009,NO_QUERY,12gaBrowningGal,"I'm killing me too! I'm hungry now- Pie, Outdo..."
85046,0,2299986642,Tue Jun 23 13:35:33 PDT 2009,NO_QUERY,12gaBrowningGal,My skeet game bites! I am not joking. I will h...
203184,4,2071364168,Sun Jun 07 19:16:31 PDT 2009,NO_QUERY,12gaBrowningGal,That's just because she trying to figure out w...
207475,4,2178187758,Mon Jun 15 07:19:40 PDT 2009,NO_QUERY,12gaBrowningGal,I am really not skilled enough to shoot trap. ...


In [170]:
# Create a dictionary consolidating users and tweets to send to LIWC 
user_list = df['user'].tolist()
tweets_list = df['text'].tolist()

user_tweet_dic = {}

for i in range(len(user_list)):
    user = user_list[i]
    tweet = tweets_list[i]
    
    if user not in user_tweet_dic:
        user_tweet_dic[user] = []
        user_tweet_dic[user].append(tweet)
    else:
        user_tweet_dic[user].append(tweet)
        
new_user_list = []
new_combined_tweets_list = []

for user, tweets in user_tweet_dic.items():
    combined_tweets = ' '.join(tweets)
    if len(combined_tweets.split(' ')) >= 100:
        new_user_list.append(user)
        new_combined_tweets_list.append(combined_tweets)
    
# Create a dataframe based on dictionary and exporting as a csv file
consolidated_tweets_df = pd.DataFrame({'user': new_user_list, 'tweets': new_combined_tweets_list})
consolidated_tweets_df.to_csv('consolidated_tweets_df.csv')

<div class='alert alert-info'>
    Imagine the group taking that csv file and putting it through the LIWC software to get the output! :D
    <br>
    The output will be saved in a file called LIWC2015 Results (consolidated_tweets_df.csv).csv that will be imported later
</div>

In [171]:
# Creating a dataframe for the first 500 users 
first_500_users = consolidated_tweets_df['user'].unique()[:500]
first_500_users_df = df[df['user'].isin(first_500_users)]

# Creating a dictionary of user and their tweets 
user_tweet_dictionary = {}

# Create a format that follows IBM Watson's format and saving it to the user_tweet_dictionary
for index, row in first_500_users_df.iterrows():
    user = row['user']
    text = row['text']
    
    tweet = {}
    
    tweet['content'] = text
    tweet['contenttype'] = 'text/plain'
    tweet['language'] = 'en'
    
    if user not in user_tweet_dictionary:
        user_tweet_dictionary[user] = {'contentItems': []}
        user_tweet_dictionary[user]['contentItems'].append(tweet)
    else:
        user_tweet_dictionary[user]['contentItems'].append(tweet)

<div class='alert alert-info'>
    The bottom few cells are commented out because we are using API calls to get the output, and after a certain number of calls, we will have to pay money!
    <br>
    Instead, we have saved the file called big5_df.csv that will also be imported later
</div>

In [172]:
# import json
# from ibm_watson import PersonalityInsightsV3
# from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

# #API KEY provided on the service page 
# KEY = 'STximBrq6oOabT5yZteafZcbt3m6Xh6SUSVuWr3RW3V6'

# # Authentication via IBM's IAM (Identity and Access Management)
# authenticator = IAMAuthenticator(KEY)

# # Creating a service instance
# service = PersonalityInsightsV3(
#     version='2017-10-13',
#     authenticator=authenticator)

# # Setting service endpoint 
# service.set_service_url('https://gateway.watsonplatform.net/personality-insights/api')

In [173]:
# # Creating a dictionary to store results from IBM Watson
# results = {'User': [],
#           'Openness': [],
#           'Conscientiousness': [],
#           'Extraversion': [],
#           'Agreeableness': [],
#           'Emotional range': []}

# # creates profiles of users in user_tweet_dictionary and saves them to results
# for user, tweets in user_tweet_dictionary.items():
#     profile = service.profile(tweets, 'application/json', raw_scores=True, consumption_preferences=True).get_result()
    
#     results['User'].append(user)
#     results['Openness'].append(profile['personality'][0]['raw_score'])
#     results['Conscientiousness'].append(profile['personality'][1]['raw_score'])
#     results['Extraversion'].append(profile['personality'][2]['raw_score'])
#     results['Agreeableness'].append(profile['personality'][3]['raw_score'])
#     results['Emotional range'].append(profile['personality'][4]['raw_score'])

In [174]:
# # Create a dataframe from results dictionary and save csv to use
# big5_df = pd.DataFrame(results)
# big5_df.to_csv('big5_df.csv')

In [175]:
# Importing both the outputs of LIWC and IBM Watson (Big 5)
LIWC_df = pd.read_csv('LIWC2015 Results (consolidated_tweets_df.csv).csv')
LIWC_df.drop(0, inplace=True)
LIWC_df.drop(columns=['A', 'C'], inplace=True)
LIWC_df.rename(columns={'B': 'User'}, inplace=True)
big5_df = pd.read_csv('big5_df.csv').drop(columns=['Unnamed: 0'])

# Merge dataframes together
updated_df = pd.merge(big5_df, LIWC_df)
updated_df.head()

Unnamed: 0,User,Openness,Conscientiousness,Extraversion,Agreeableness,Emotional range,WC,Analytic,Clout,Authentic,...,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP
0,12gaBrowningGal,0.764999,0.656144,0.548956,0.788804,0.439579,388,27.36,34.94,68.96,...,2.58,0.0,0.0,0.52,12.37,0.52,0.0,3.61,0.0,0.0
1,15Stepz,0.748124,0.690716,0.564859,0.82393,0.41077,254,26.07,70.94,84.8,...,9.45,0.39,0.0,1.18,0.39,1.57,0.0,4.33,1.57,0.0
2,16_MileyCyrus,0.672814,0.637773,0.525612,0.811025,0.563704,1164,21.04,64.42,58.8,...,3.26,0.52,0.0,2.15,13.57,0.43,0.0,1.37,0.26,1.03
3,18percentgrey,0.789412,0.607883,0.528168,0.772518,0.461528,931,47.25,53.87,58.83,...,4.83,0.64,0.75,1.61,2.26,1.93,0.0,3.22,1.29,2.26
4,19fischi75,0.706904,0.627109,0.447471,0.803232,0.518279,1707,48.43,48.83,63.95,...,0.7,0.23,0.0,3.1,1.64,3.87,0.0,0.0,2.58,3.81


In [176]:
def normalise(number):
    return round(number*100,2)

updated_df['Openness'] = updated_df['Openness'].apply(normalise)
updated_df['Conscientiousness'] = updated_df['Conscientiousness'].apply(normalise)
updated_df['Extraversion'] = updated_df['Extraversion'].apply(normalise)
updated_df['Agreeableness'] = updated_df['Agreeableness'].apply(normalise)
updated_df['Emotional range'] = updated_df['Emotional range'].apply(normalise)

In [177]:
def readability_score(row):
    sixletter = row['Sixltr']
    wps = row['WPS'] 
    wordcount = row['WC']
    
    readability = (-1 * sixletter) + wps - wordcount + 3
    return readability 

updated_df['readability'] = updated_df.apply(readability_score, axis=1)

In [178]:
def readability_percentile(score):
    readability_range = updated_df['readability'].max() - updated_df['readability'].min()
    return -score/readability_range*100

updated_df['readability'] = updated_df['readability'].apply(readability_percentile)

In [179]:
def composite_score(row):    
    #intermediate numbers
    sixletter = row['Sixltr']
    articles = row['article']
    pasttense = row['focuspast']
    presenttense = row['focuspresent']
    firstperson = 100 - row['i'] #inverse of first person singular pronoun
    discrepancy = row['discrep']
    wps = row['WPS'] #words per sentence
    wordcount = row['WC']
    
    #features
    #Distancing (0.19) - six letter words, articles, past tense, and the inverse of first person singular pronouns, present tense and discrepancy words
    
    Distancing = (sixletter + articles + pasttense + firstperson + presenttense + discrepancy)/6 #average of 6 intermediate numbers
    
    #Readability (-0.32) - multiplying negative one by six letter words, adding words per sentence, subtracting the amount of words recognized by the LIWC dictionary, and adding three
#     Readability = (-1 * sixletter) + wps - wordcount + 3
    Readability = row['readability']
    
    #Swear (0.31) 
    Swear = row['swear']
    
    #Anger (0.22)
    Anger = row['anger']
    
    #Agreeability (-0.43)
    Agreeability = row['Agreeableness']
        
    #Neuroticism (0.3)
    Neuroticism = row['Emotional range']
    
    #calculate composite score
    Total = 0.19 + 0.32 + 0.31 + 0.22 + 0.3 + 0.43
    Psychopathy_Score = 0.19/Total * Distancing - 0.32/Total * Readability + 0.31/Total * Swear + 0.22/Total * Anger - 0.43/Total * Agreeability + 0.3/Total * Neuroticism
    
    #write into dictionary
    return Psychopathy_Score

updated_df['composite score'] = updated_df.apply(composite_score, axis=1)

In [180]:
def score_percentile(score):
    max_score = updated_df['composite score'].max()
    min_score = updated_df['composite score'].min()
    score_range = max_score - min_score
    return (score-min_score)/score_range*100

updated_df['composite score percentile'] = updated_df['composite score'].apply(score_percentile)

In [181]:
updated_df = updated_df[['User', 'Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Emotional range', 'WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic', 'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate', 'verb', 'adj', 'compare', 'interrog', 'number', 'quant', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad', 'social', 'family', 'friend', 'female', 'male', 'cogproc', 'insight', 'cause', 'discrep', 'tentat', 'certain', 'differ', 'percept', 'see', 'hear', 'feel', 'bio', 'body', 'health', 'sexual', 'ingest', 'drives', 'affiliation', 'achieve', 'power', 'reward', 'risk', 'focuspast', 'focuspresent', 'focusfuture', 'relativ', 'motion', 'space', 'time', 'work', 'leisure', 'home', 'money', 'relig', 'death', 'informal', 'swear', 'netspeak', 'assent', 'nonflu', 'filler', 'AllPunc', 'Period', 'Comma', 'Colon', 'SemiC', 'QMark', 'Exclam', 'Dash', 'Quote', 'Apostro', 'Parenth', 'OtherP', 'readability', 'composite score', 'composite score percentile']]

In [240]:
updated_df.head()

Unnamed: 0,User,Openness,Conscientiousness,Extraversion,Agreeableness,Emotional range,WC,Analytic,Clout,Authentic,...,Exclam,Dash,Quote,Apostro,Parenth,OtherP,readability,composite score,composite score percentile,psychopath
0,12gaBrowningGal,76.5,65.61,54.9,78.88,43.96,388,27.36,34.94,68.96,...,12.37,0.52,0.0,3.61,0.0,0.0,18.249702,-12.462168,55.478689,0
1,15Stepz,74.81,69.07,56.49,82.39,41.08,254,26.07,70.94,84.8,...,0.39,1.57,0.0,4.33,1.57,0.0,10.837813,-12.550057,55.06985,0
2,16_MileyCyrus,67.28,63.78,52.56,81.1,56.37,1164,21.04,64.42,58.8,...,13.57,0.43,0.0,1.37,0.26,1.03,54.34338,-17.747231,30.893563,0
3,18percentgrey,78.94,60.79,52.82,77.25,46.15,931,47.25,53.87,58.83,...,2.26,1.93,0.0,3.22,1.29,2.26,43.858291,-16.418514,37.074506,0
4,19fischi75,70.69,62.71,44.75,80.32,51.83,1707,48.43,48.83,63.95,...,1.64,3.87,0.0,0.0,2.58,3.81,78.600174,-22.679146,7.95121,0


In [241]:
updated_df.sort_values(by='composite score', ascending=False)

Unnamed: 0,User,Openness,Conscientiousness,Extraversion,Agreeableness,Emotional range,WC,Analytic,Clout,Authentic,...,Exclam,Dash,Quote,Apostro,Parenth,OtherP,readability,composite score,composite score percentile,psychopath
47,Abby_ox,64.52,51.86,39.56,66.06,77.37,305,56.42,4.43,97.80,...,0.66,0.00,0.00,3.28,0.00,2.30,13.664762,-2.891426,100.000000,1
239,BethanyAnn614,67.97,56.48,49.19,72.73,87.82,379,54.79,23.01,73.69,...,3.43,0.00,0.00,1.06,0.53,1.06,17.784407,-2.944168,99.754655,1
235,Benjimonicus,68.73,52.83,42.49,63.95,76.37,385,59.27,26.65,78.40,...,1.04,0.26,0.00,4.68,0.00,0.52,17.613205,-3.044657,99.287200,1
244,BiGVixXen,68.36,50.65,42.85,72.13,81.86,225,9.74,8.43,99.00,...,11.56,0.00,0.00,8.00,0.00,0.00,10.407696,-3.155676,98.770759,1
494,DaniScot,69.67,50.53,41.53,67.40,75.14,289,65.89,9.44,99.00,...,5.88,0.00,0.00,3.11,0.69,0.69,13.353784,-3.633829,96.546479,1
424,CoConutShelle,63.47,57.43,47.24,72.90,81.17,289,60.29,18.41,53.02,...,7.61,0.35,0.00,2.08,0.00,0.00,13.433522,-3.821409,95.673893,1
269,BonjourHoney,75.07,50.75,43.40,74.96,72.85,171,6.03,59.24,50.35,...,0.00,0.00,0.00,5.85,0.00,0.00,7.616863,-4.538181,92.339604,1
209,Bastante_P,69.27,49.89,46.49,68.52,68.92,305,43.39,14.70,80.82,...,3.93,0.00,0.00,4.92,0.33,0.66,13.180705,-4.573028,92.177502,1
30,ACsBarbieGirl69,66.75,52.23,44.40,70.72,70.38,306,33.80,13.32,62.28,...,14.71,0.33,0.00,0.00,0.00,0.98,14.223398,-4.757017,91.321616,1
219,Beccixo,68.74,55.63,47.50,67.37,65.93,269,83.28,27.61,81.57,...,0.00,0.37,0.00,0.37,0.37,0.74,10.827494,-4.775903,91.233765,1


In [252]:
psychopath_list = []
for index, row in updated_df.iterrows():
    # This is with the assumption that Boy_Kill_Boy is the threshold for psychopathy
    if row['composite score'] > -5.76:
        psychopath_list.append(1)
    else: 
        psychopath_list.append(0)

In [253]:
updated_df['psychopath'] = psychopath_list

In [254]:
NLP_testing_df = updated_df[['User','psychopath']]

In [255]:
NLP_testing_df = pd.merge(NLP_testing_df,consolidated_tweets_df.rename(columns={'user':'User'}),how='left')[['User','tweets','psychopath']]

The process below is based on: https://towardsdatascience.com/sentiment-analysis-with-python-part-1-5ce197074184

In [256]:
# PREPROCESSING
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

stop_words = set(stopwords.words('english')) 

def clean_tweets(text):
    tokens = word_tokenize(text)
    # Remove stopwords and lowercase all words
    filtered_sentence = [w.lower() for w in tokens if w not in stop_words] 
    # lowercase all words
    return ' '.join(filtered_sentence)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\steve\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [257]:
NLP_testing_df_cleaned = NLP_testing_df.copy()
NLP_testing_df_cleaned['tweets'] = NLP_testing_df_cleaned['tweets'].apply(clean_tweets)

In [258]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(NLP_testing_df_cleaned['tweets'], NLP_testing_df_cleaned['psychopath'], test_size=0.3, random_state=42)

In [259]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)
cv.fit(NLP_testing_df_cleaned['tweets'])
X_train = cv.transform(X_train)
X_test = cv.transform(X_test)

In [270]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
    
lr = LogisticRegression()
lr.fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, lr.predict(X_test)))

Accuracy: 0.96


In [263]:
feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), lr.coef_[0]
    )
}

print('Words that predict psychopathy\n')
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)
    
print('======================================')
print('Words that predict non-psychopathy\n')
    
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)
    
#     ('worst', -1.367978497228895)
#     ('waste', -1.1684451288279047)
#     ('awful', -1.0277001734353677)
#     ('poorly', -0.8748317895742782)
#     ('boring', -0.8587249740682945)

Words that predict psychopathy

('feel', 0.24034385653673868)
('crap', 0.21321869721288625)
('hurts', 0.20832143889043878)
('ruin', 0.18293869976184784)
('now', 0.18211319057874928)
Words that predict non-psychopathy

('good', -0.47845020492140033)
('sorry', -0.30238455431403044)
('thanks', -0.28741226789028057)
('great', -0.2846610662154322)
('love', -0.2757442051412337)


In [269]:
test_text = cv.transform(['Have a good day!'.lower()])
lr.predict(test_text)[0]

0

In [271]:
test_text = cv.transform(['My head hurts!'.lower()])
lr.predict(test_text)[0]

1