In [37]:
import pandas as pd
import numpy as np
import datetime
import sys
import codecs
import re
import urllib3
import itertools, collections
 
import nltk  # Natural Language Processing
#nltk.download('punkt')
#nltk.download('all')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords # list of words
from collections import Counter  # optimized way to do this
import string  # list(string.punctuation) - produces a list of punctuations
import copy
from itertools import product, tee, combinations, chain
from nltk.stem import PorterStemmer
from operator import itemgetter # help with dataframes
 
from scipy.spatial.distance import cosine
 
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin
from sklearn.utils import shuffle
 


In [38]:
df = pd.read_csv('./twitterBios.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.head()

Unnamed: 0,user,bio
0,MikePaineShow,#voiceover talent voice actor & radio guy for ...
1,gablescinema,South Florida's premiere nonprofit cinema for ...
2,dhumpachika,Follow us for the best of what's happening abo...
3,DavyVara,Filmmaker Writer Activist Aspiring Actor... I ...
4,keeeykeh,Dedicated student of the beautiful game. Head ...


In [39]:
# some constants for regex
START_OF_LINE = r"^"
OPTIONAL = "?"
ANYTHING = "."
ZERO_OR_MORE = "*"
ONE_OR_MORE = "+"

SPACE = "\s"
SPACES = SPACE + ONE_OR_MORE
NOT_SPACE = "[^\s]" + ONE_OR_MORE
EVERYTHING_OR_NOTHING = ANYTHING + ZERO_OR_MORE

ERASE = ""
FORWARD_SLASH = "\/"
NEWLINES = r"[\r\n]"

In [40]:
df = df.dropna(axis=0)
df = df.reset_index(drop=True)

def clean_up(text):
    RE_TWEET = START_OF_LINE + "RT" + SPACES
    print(text)
    text = re.sub(RE_TWEET, ERASE, text)
    HYPERLINKS = ("http" + "s" + OPTIONAL + ":" + FORWARD_SLASH + FORWARD_SLASH
              + NOT_SPACE + NEWLINES + ZERO_OR_MORE)

    text = re.sub(HYPERLINKS, ERASE, text)
    HASH = "#"
    text = re.sub(HASH, ERASE, text)
    return text

In [41]:
df['bio']

0      #voiceover talent voice actor & radio guy for ...
1      South Florida's premiere nonprofit cinema for ...
2      Follow us for the best of what's happening abo...
3      Filmmaker Writer Activist Aspiring Actor... I ...
4      Dedicated student of the beautiful game. Head ...
                             ...                        
149    The Cook Museum of Natural Science is a 501c3,...
150    Prologue seeks to to enrich the lives of every...
151    Private Law teacher at University.\r\nInto The...
152    Official CSUF Parking & Transportation Service...
153    Fashion Sunglasses  for Men & Women\r\nOur Mot...
Name: bio, Length: 154, dtype: object

In [42]:
df['bio'] = df['bio'].apply(clean_up)

#voiceover talent voice actor & radio guy for 30+ years. My wife is my best friend & I LOVE soup!
South Florida's premiere nonprofit cinema for the best independent, foreign, and cult classic films digitally restored and on 35mm & 70mm.
Follow us for the best of what's happening about #Cycling and #biking around the world | Chika chikaaaaaaa ka bro!
Filmmaker Writer Activist Aspiring Actor... I MAKE FILMS AND VIDEOS THAT MAKE PEOPLE THINK... I ASK THE QUESTIONS YOU DON’T HAVE THE COJONES TO ASK.
Dedicated student of the beautiful game. Head Coach at College of the Desert, Men's Soccer. Christ, canines, and coffee enthusiast. #Ao1
In the business of mastering my energy to be my most authentic empowered self and create something that will change my world.
Southern California’s Premier Electric Scooter Store. 
Contact us: sales@electriccityrides.net
Fast and friendly roll off dumpster company. Geared more towards residential demand. 10, 15, and 20 cubic yard dumpsters.
The JP Strategies t

In [43]:
df.iloc[16, 0:3]

user                                            LisaMatik
bio     School Psych at MVHS; Assistant Girls XC Coach...
Name: 16, dtype: object

In [44]:
# Similarity Measure *******************************************************************************************
def cosine_sim(v1, v2):
         
    rho = round(1.0 - cosine(v1, v2), 3)
    rho = rho if(not np.isnan(rho)) else 0.0
    return rho
 
# Words Replacement ***************************************************************************************
def replace_all(text, dic):
    for i, j in dic.items():
        text = text.replace(i, j)
    return text
 
# Function to find element with Maximum Frequency in TDM  *******************************************************************
def nanargmax(a):
    idx = np.argmax(a, axis=None)
    multi_idx = np.unravel_index(idx, a.shape)
    if np.isnan(a[multi_idx]):
        nan_count = np.sum(np.isnan(a))
 
        idx = np.argpartition(a, -nan_count-1, axis=None)[-nan_count-1]
        multi_idx = np.unravel_index(idx, a.shape)
    return multi_idx
 
# Define Top K Neighbours to the WORD or TWEET ***************************************************************************
def K_neighbor(k, term, list_t):
     
    # list_t - a list of tuples
    # term - value of criteria (tweet or word)
     
    neighbor = []
    
    for item in list_t:
        if term in item:
            neighbor.append(item) 
     
    neighbor.sort(key = itemgetter(0), reverse=True)
       
    print ('Top ', k, ' elements for ', term)   
    print('**********************************************')
         
    for i in range(k):
        print (neighbor[i])
     
    return neighbor[:k]
 
# Determine Pair of Words Counter method ******************************************************************************
def Pair_words(word_list, tweet_clean_fin, n_top):
 
    pairs = list(itertools.combinations(word_list, 2)) # order does not matter
 
    #pairs = set(map(tuple, map(sorted, _pairs)))
    pairs = set(pairs)
    c = collections.Counter()
 
    for tweet in tweet_clean_fin:
        for pair in pairs:
            if pair[0] == pair[1]: 
                pass
            elif pair[0] in tweet and pair[1] in tweet:
                #c.update({pair: 1})
                c[pair] +=1
  
    return c.most_common(n_top)
 
# BIC score function ********************************************************************************
 
from sklearn import cluster
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler
 
def compute_bic(kmeans,X):
    """
    Computes the BIC metric for given clusters
 
    Parameters:
    -----------------------------------------
    kmeans:  List of clustering object from scikit learn
 
    X     :  multidimension np array of data points
 
    """
    # assign centers and labels
    centers = [kmeans.cluster_centers_]
    labels  = kmeans.labels_
    #number of clusters
    m = kmeans.n_clusters
    # size of the clusters
    n = np.bincount(labels)
    #size of data set
    N, d = X.shape
 
    #compute variance for all clusters beforehand
    cl_var =  (1.0 / (N - m) / d) * sum([sum(distance.cdist(X[np.where(labels == i)], [centers[0][i]], 'euclidean')**2) for i in range(m)])
    const_term = 0.5 * m * np.log(N) * (d+1)
 
    BIC = np.sum([n[i] * np.log(n[i]) -
               n[i] * np.log(N) -
             ((n[i] * d) / 2) * np.log(2*np.pi*cl_var) -
             ((n[i] - 1) * d/ 2) for i in range(m)]) - const_term
 
    return(BIC)

In [45]:
tweet_list = df['bio'].tolist() # convert DF to list (tweets only) NOT a nested list

emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
# Regex_str is used to GET text from CSV file
 
regex_str = [
     
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-signs
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)' # other words
]
 
# These Regex are used to EXCLUDE items from the text AFTER IMPORTING from csv with regex_str
 
numbers = r'(?:(?:\d+,?)+(?:\.?\d+)?)'
URL = r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+'
html_tag = r'<[^>]+>'
hash_tag = r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"
at_sign = r'(?:@[\w_]+)'
dash_quote = r"(?:[a-z][a-z'\-_]+[a-z])"
other_word = r'(?:[\w_]+)'
other_stuff = r'(?:\S)' # anything else - NOT USED
start_pound = r"([#?])(\w+)" # Start with #
start_quest_pound = r"(?:^|\s)([#?])(\w+)" # Start with ? or with #
cont_number = r'(\w*\d\w*)' # Words containing numbers
 
# My REGEX **************************************************************************
 
#      Remove '[' and ']' brackets
 
sq_br_f = r'(?:[[\w_]+)' # removes '['
sq_br_b = r'(?:][\w_]+)' # removes ']'
 
rem_bracket = r'(' + '|'.join([sq_br_f, sq_br_b]) +')'
rem_bracketC = re.compile(rem_bracket, re.VERBOSE)
 
# Removes all words of 3 characters or less *****************************************************
 
short_words = r'\W*\b\w{1,3}\b' # Short words of 3 character or less
short_wordsC = re.compile(short_words, re.VERBOSE | re.IGNORECASE)
 
# REGEX remove all words with \ and / combinations
 
slash_back =  r'\s*(?:[\w_]*\\(?:[\w_]*\\)*[\w_]*)'
slash_fwd = r'\s*(?:[\w_]*/(?:[\w_]*/)*[\w_]*)'
slash_all = r'\s*(?:[\w_]*[/\\](?:[\w_]*[/\\])*[\w_]*)'
 
# REGEX numbers, short words and URL only to EXCLUDE +++++++++++++++++++++++++++++++++++++++++++++++++++
 
num_url_short = r'(' + '|'.join([numbers, URL, short_words + sq_br_f + sq_br_b]) +')'  # Exclude from tweets
comp_num_url_short = re.compile(num_url_short, re.VERBOSE | re.IGNORECASE)
 
# Master REGEX to INCLUDE from the original tweets ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 
list_regex = r'(' + '|'.join(regex_str) + ')'
 
master_regex = re.compile(list_regex, re.VERBOSE | re.IGNORECASE) # TAKE from tweets INITIALLY


In [46]:
# Filters IMPORTED from csv file data
 
def filterPick(list, filter):
    return [ ( l, m.group(1) ) for l in list for m in (filter(l),) if m]
 
search_regex = re.compile(list_regex, re.VERBOSE | re.IGNORECASE).search
 
# Use tweetList -  that is a list from DF (using .tolist())
 
outlist_init = filterPick(tweet_list, search_regex) # It is a tuple: initial list from all tweets
 
char_remove = [']', '[', '(', ')', '{', '}'] # characters to be removed
words_keep = ['old', 'new', 'age', 'lot', 'bag', 'top', 'cat', 'bat', 'sap', 'jda', 'tea', 'dog', 'lie', 'law', 'lab',\
             'mob', 'map', 'car', 'fat', 'sea', 'saw', 'raw', 'rob', 'win', 'can', 'get', 'fan', 'fun', 'big',\
             'use', 'pea', 'pit','pot', 'pat', 'ear', 'eye', 'kit', 'pot', 'pen', 'bud', 'bet', 'god', 'tax', 'won', 'run',\
              'lid', 'log', 'pr', 'pd', 'cop', 'nyc', 'ny', 'la', 'toy', 'war', 'law', 'lax', 'jfk', 'fed', 'cry', 'ceo',\
              'pay', 'pet', 'fan', 'fun', 'usd', 'rio']
 
emotion_list = [':)', ';)', '(:', '(;', '}', '{','}']
word_garb = ['here', 'there', 'where', 'when', 'would', 'should', 'could','thats', 'youre', 'thanks', 'hasn',\
             'thank', 'https', 'since', 'wanna', 'gonna', 'aint', 'http', 'unto', 'onto', 'into', 'havent',\
             'dont', 'done', 'cant', 'werent', 'https', 'u', 'isnt', 'go', 'theyre', 'each', 'every', 'shes', 'youve', 'youll',\
            'weve', 'theyve']
 
# Dictionary with Replacement Pairs ******************************************************************************
repl_dict = {'googleele': 'goog', 'lyin': 'lie', 'googles': 'goog', 'aapl':'apple',\
             'msft':'microsoft', 'google': 'goog', 'googl':'goog'}
 
exclude = list(string.punctuation) + emotion_list + word_garb
 
# Convert tuple to a list, then to a string; Remove the characters; Stays as a STRING. Porter Stemmer
 
stemmer=PorterStemmer()
lmtzr = WordNetLemmatizer()

In [47]:
# Convert tuple to a list, then to a string; Remove the characters; Stays as a STRING. Porter Stemmer
 
# Preparing CLEAN tweets tp keep SEPARATELY from WORDS in TWEETS
 
tweet_clean_fin = [] # Cleaned Tweets - Final Version
for tweet in outlist_init:
 
    tw_clean = []
    tw_clean = [ch for ch in tweet if ch not in char_remove]
 
    tw_clean = re.sub(URL, "", str(tw_clean))
    tw_clean = re.sub(html_tag, "",str(tw_clean))
    tw_clean = re.sub(hash_tag, "",str(tw_clean))
    tw_clean = re.sub(slash_all,"", str(tw_clean))
    tw_clean = re.sub(cont_number, "",str(tw_clean))
    tw_clean = re.sub(numbers, "",str(tw_clean))
    tw_clean = re.sub(start_pound, "",str(tw_clean))
    tw_clean = re.sub(start_quest_pound, "",str(tw_clean))
    tw_clean = re.sub(at_sign, "",str(tw_clean))
    tw_clean = re.sub("'", "",str(tw_clean))
    tw_clean = re.sub('"', "",str(tw_clean))
    tw_clean = re.sub(r'(?:^|\s)[@#].*?(?=[,;:.!?]|\s|$)', r'', tw_clean) # Removes # and @ in words (lookahead)
 
    tw_clean = lmtzr.lemmatize(str(tw_clean))
    #tw_clean = stemmer.stem(str(tw_clean))
     
    tw_clean_lst = re.findall(r'\w+', str(tw_clean))
     
    tw_clean_lst = [tw.lower() for tw in tw_clean_lst if tw.lower() not in stopwords.words('english')]
    tw_clean_lst = [word for word in tw_clean_lst if word not in exclude]
    tw_clean_lst = str([word for word in tw_clean_lst if len(word)>3 or word.lower() in words_keep])
     
    tw_clean_lst = re.findall(r'\w+', str(tw_clean_lst))
    tw_clean_lst = [replace_all(word, repl_dict) for word in tw_clean_lst]
     
    tweet_clean_fin.append(list(tw_clean_lst))
 
# Delete various elements from the text (LIST OF WORDS)
 
out_list_fin = []
out_string_temp = ''.join([ch for ch in str(list(outlist_init)) if ch not in char_remove])
 
out_string_temp = re.sub(URL, "", out_string_temp)
out_string_temp = re.sub(html_tag, "", out_string_temp)
out_string_temp = re.sub(hash_tag, "", out_string_temp)
out_string_temp = re.sub(slash_all,"", str(out_string_temp))
out_string_temp = re.sub(cont_number, "", out_string_temp) 
out_string_temp = re.sub(numbers, "", out_string_temp)
out_string_temp = re.sub(start_pound, "", out_string_temp)
out_string_temp = re.sub(start_quest_pound, "", out_string_temp)
out_string_temp = re.sub(at_sign, "", out_string_temp)
out_string_temp = re.sub("'", "", out_string_temp)
out_string_temp = re.sub('"', "", out_string_temp)
out_string_temp = re.sub(r'(?:^|\s)[@#].*?(?=[,;:.!?]|\s|$)', r'', out_string_temp) # Removes # and @ in words (lookahead)
 
out_list_w = re.findall(r'\w+', out_string_temp)
 
out_string_short = str([word.lower() for word in out_list_w if len(word)>3 or word.lower() in words_keep])
 
out_list_w = re.findall(r'\w+', out_string_short)   
 
out_list_w = [lmtzr.lemmatize(word) for word in out_list_w]
#out_list_w = [stemmer.stem(word) for word in out_list_w]
out_list_w = [word.lower() for word in out_list_w if word.lower() not in stopwords.words('english')]  # Remove stopwords
out_list_w = str([word.lower() for word in out_list_w if word not in exclude])
out_string_rpl = replace_all(out_list_w, repl_dict) # replace all words from dictionary
 
# Convert "Cleaned" STRING to a LIST
 
out_list_fin = re.findall(r'\w+', out_string_rpl)
 
list_len = len(out_list_fin)
word_list = set(out_list_fin) # list of unique words from all tweets - SET
word_list_len = len(word_list)
 
print ("Set = ", word_list_len, "Original Qty = ", list_len)
print (word_list)
print( '********************************************************************************************************'
)
print (tweet_clean_fin)
print (len(tweet_clean_fin))

rly', 'reading', 'promoting', 'flower', 'advice', 'empowering', 'humanrights', 'freedom', 'affordable', 'official', 'resort', 'unapologetic', 'crime', 'campus', 'tiger', 'summerhillpreschool', 'city', 'yogi', 'yard', 'village', 'invest', 'peer', 'moderator', 'huntsville', 'retweet', 'personal', 'trainer', 'deja', 'jesus', 'body', 'post', 'news', 'please', 'rabbit', 'block', 'october', 'exec', 'acttoo', 'dowellanddoubtnot', 'winning', 'counselor', 'manufacturer', 'service', 'past', 'connecting', 'contract', 'milujem', 'check', 'charlotte', 'training', 'national', 'wife', 'soup', 'desert', 'teampeek', 'advocate', 'finish', 'shipping', 'boutique', 'amplify', 'narodil', 'quality', 'bringing', 'solution', 'opportunity', 'board', 'jazyk', 'stampede', 'appearance', 'cardiologist', 'authentic', 'space', 'clinical', 'potential', 'grade', 'apparel', 'market', 'transportation', 'fine', 'boy', 'storyteller', 'struggling', 'comedy', 'prek', 'oblakov', 'helpthosewhoteach', 'smile', 'teacher', 'eveyo

In [48]:
# Create a matrix of frequencies for word pairs
 
words = {v:k for (k, v) in enumerate(word_list)}
keys = words.keys() # List all UNIQUE words in the dictionary from all CLEANED tweets   
l_keys = len(keys) 
 
matrix_pair = np.zeros([l_keys, l_keys]) # store all combination of keys
 
for tweet in tweet_clean_fin:
    word_l = []
     
    for word in tweet:
        word_l.append(word)         # List of words from ONE CLEANED tweet
    
    items = set(word_l)  #set of words in from ONE CLEANED tweet
    items = [term for term in items if term in keys] # take only words from a tweet that are in keys
    index = [words[pos] for pos in items] # positions of the words

    for i1 in index: 
        for i2 in index:
            if i1< i2:
                matrix_pair[i1][i2] += 1  #frequency
                 
print ("Frequency Matrix *********************************************")
print (matrix_pair)

print ('Maximum Frequency', np.max(matrix_pair))
 
idx1, idx2 = nanargmax(matrix_pair)
 
print ("Indexes for a pair with max frequency - ", idx1, idx2)
print ("Pair of Words with Max Frequency: Word1 - ", words.keys())
 
# Selecting TOP N elements from the Matrix ##########################################################################
 
n_top = 10
 
matrix_pairF = matrix_pair.flatten()
idx_f = matrix_pairF.argsort()[-n_top:]
x_idx, y_idx = np.unravel_index(idx_f, matrix_pair.shape)
 
for x, y, in zip(x_idx, y_idx):
    print("Frequency = ", matrix_pair[x][y], "index1 = ", x, "index2 = ", y, "Word1 - ", words.keys())
# print("items!!!!",items)


', 'group', 'need', 'operated', 'kitten', 'online', 'supporter', 'peace', 'daughter', 'recruitment', 'experiencebwi', 'rock', 'roll', 'control', 'committed', 'make', 'divorced', 'person', 'father', 'touring', 'lot', 'beyond', 'photo', 'attraction', 'balm', 'everything', 'learning', 'alum', 'network', 'extermination', 'houston', 'pennstateproud', 'package', 'redeemed', 'engagement', 'born', 'activistny', 'neovendetta', 'create', 'restored', 'century', 'latest', 'cat', 'search', 'husband', 'earth', 'norm', 'healthtech', 'purpose', 'card', 'umsl', 'mental', 'meaningful', 'empowered', 'pursuit', 'difficult', 'csulb', 'eye', 'best', 'keepkamras', 'mhhsbd', 'activism', 'downtown', 'kinesiology', 'north', 'model', 'improving', 'coach', 'energy', 'showgirl', 'designing', 'furniture', 'first', 'communication', 'driving', 'keeppounding', 'progressive', 'retail', 'genesee', 'follow', 'issue', 'luggage', 'ohio', 'female', 'woman', 'grandpa', 'nutritionist', 'bitch', 'united', 'lady', 'visionforbla

In [49]:
df.index

RangeIndex(start=0, stop=154, step=1)

In [50]:
# Create document-term-matrix
num_tweets = 93
columns = word_list
ncols = word_list_len + 1
 
term_doc = pd.DataFrame(columns = columns)
term_doc.insert(0, "bio", "user")
term_doc["bio"] = df["bio"]
# print(df['bio'])
term_doc.fillna(0, inplace=True)
 
i_row = 0
for line in tweet_clean_fin:
     
    for word in line:
 
        for col in range(1, ncols-1):
            if word == term_doc.columns[col]: term_doc.iloc[i_row, col] += 1
 
    i_row += 1
 
# DataFrame for Statistics with Totals by Row and by Column
     
statDF = copy.deepcopy(term_doc)
columns_cl = ["bio", "user"]
tweet_sim = pd.DataFrame(columns = columns_cl)
tweet_sim = df["bio"]
tweet_sim.fillna(0.0, inplace=True)
 
# Sum Rows by Columns
row_sum = statDF.sum(axis=1)
statDF["Total"] = row_sum
# print ('Row Max Value = ', row_sum.max())
# print ("Max Value DF = ", statDF["Total"].max(axis=0))
 
# Sum Columns by Row:
col_list = list(statDF)
col_list.remove('bio')
 
rsum = {col: statDF[col].sum() for col in col_list}
# Turn the sums into a DataFrame with one row with an index of 'Total':
sum_df = pd.DataFrame(rsum, index=["Total"])
# Now append the row:
statDF = statDF.append(sum_df)
 
# Calculate Similarity of Unique Words
tup_word = [] # need to pull column headers and rows af words
sim_word = np.zeros((ncols, ncols))
 
# for i in range(ncols-1):
     
#     v1 = [0.0]*ncols
#     v1 = term_doc.iloc[:, i+1]
     
#     for k in range(ncols-1):
         
#         v2 = [0.0]*ncols 
#         if i >= k: pass
#         else:
#             v2 = term_doc.iloc[:, k+1]
#             similar = cosine_sim(v1, v2)
#             tup_w = (similar, list(columns)[i], list(columns)[k])
 
#             tup_word.append(tup_w)
#             sim_word[i,k] = similar
#             sim_word[k,i] = similar
     
#     sim_word[i,i] = 1.0
 
# sim_word[ncols-1,ncols-1] = 1.0
 
# print ('Similarity for Words: Words = ', word_list_len)
# print (sim_word)
 
# SIMILARITY for TWEETS
tu_tweet = []
sim_tweet = np.zeros((num_tweets, num_tweets))
 
for i in range(num_tweets):
    v1 = [0.0]*num_tweets
    v1 = term_doc.iloc[i, 1:]
     
    for k in range(num_tweets):
         
        v2 = [0.0]*num_tweets
        if i >= k: pass
        else:
            v2 = term_doc.iloc[k, 1:]
            similar = cosine_sim(v1, v2)
            tup_twe = (similar, term_doc['bio'][i], term_doc['bio'][k])
            
            tu_tweet.append(tup_twe) 
            sim_tweet[i, k] = similar
            sim_tweet[k, i] = similar
    sim_tweet[i,i] = 1.0

print ("Similarity for Bios: Bios = ", num_tweets)
print (sim_tweet)
 
statDF.tail()

  dist = 1.0 - uv / np.sqrt(uu * vv)
Similarity for Bios: Bios =  93
[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


Unnamed: 0,bio,curated,jordan,anne,brass,justice,jewelry,count,tuffiest,trying,...,missing,adventurer,imthatchickwhoaintafraid,patient,platform,emergingtech,gem,directory,combining,Total
150,Prologue seeks to to enrich the lives of every...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
151,Private Law teacher at University.\r\nInto The...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14
152,Official CSUF Parking & Transportation Service...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,11
153,Fashion Sunglasses for Men & Women\r\nOur Mot...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
Total,,1,0,1,1,1,1,0,1,1,...,1,2,1,0,1,1,0,1,0,1421


In [51]:
# Determine Top N TWEETS / WORDS
 
K_neighbor(n_top, df['bio'][92], tu_tweet)  #Top tweets for a given tweet


Top  10  elements for  I love my family, my cats, my friends, and my job, as well as traveling, volleyball, sunshine, the beach, great deals and contests.
**********************************************
(0.0, 'voiceover talent voice actor & radio guy for 30+ years. My wife is my best friend & I LOVE soup!', 'I love my family, my cats, my friends, and my job, as well as traveling, volleyball, sunshine, the beach, great deals and contests.')
(0.0, "South Florida's premiere nonprofit cinema for the best independent, foreign, and cult classic films digitally restored and on 35mm & 70mm.", 'I love my family, my cats, my friends, and my job, as well as traveling, volleyball, sunshine, the beach, great deals and contests.')
(0.0, "Follow us for the best of what's happening about Cycling and biking around the world | Chika chikaaaaaaa ka bro!", 'I love my family, my cats, my friends, and my job, as well as traveling, volleyball, sunshine, the beach, great deals and contests.')
(0.0, 'Filmmaker 

[(0.0,
  'voiceover talent voice actor & radio guy for 30+ years. My wife is my best friend & I LOVE soup!',
  'I love my family, my cats, my friends, and my job, as well as traveling, volleyball, sunshine, the beach, great deals and contests.'),
 (0.0,
  "South Florida's premiere nonprofit cinema for the best independent, foreign, and cult classic films digitally restored and on 35mm & 70mm.",
  'I love my family, my cats, my friends, and my job, as well as traveling, volleyball, sunshine, the beach, great deals and contests.'),
 (0.0,
  "Follow us for the best of what's happening about Cycling and biking around the world | Chika chikaaaaaaa ka bro!",
  'I love my family, my cats, my friends, and my job, as well as traveling, volleyball, sunshine, the beach, great deals and contests.'),
 (0.0,
  'Filmmaker Writer Activist Aspiring Actor... I MAKE FILMS AND VIDEOS THAT MAKE PEOPLE THINK... I ASK THE QUESTIONS YOU DON’T HAVE THE COJONES TO ASK.',
  'I love my family, my cats, my friends

In [52]:
df['bio']

0      voiceover talent voice actor & radio guy for 3...
1      South Florida's premiere nonprofit cinema for ...
2      Follow us for the best of what's happening abo...
3      Filmmaker Writer Activist Aspiring Actor... I ...
4      Dedicated student of the beautiful game. Head ...
                             ...                        
149    The Cook Museum of Natural Science is a 501c3,...
150    Prologue seeks to to enrich the lives of every...
151    Private Law teacher at University.\r\nInto The...
152    Official CSUF Parking & Transportation Service...
153    Fashion Sunglasses  for Men & Women\r\nOur Mot...
Name: bio, Length: 154, dtype: object

In [53]:
def tweet_prep(df):
     
    tweet_list = df['bio'].tolist()
    tweet_list_clean = df['bio'].tolist()
    word_list_cl = [[word for word in str(line).split()] for line in tweet_list_clean]
    word_list_tot = list(chain.from_iterable(word_list_cl))
     
    set_word = set(word_list_tot) # from clean tweets
     
    return Pair_words(set_word, tweet_list_clean, n_top)
 
print ("Top ", n_top, " pairs of words")
 
most_comm = Pair_words(word_list, tweet_clean_fin, n_top)
 
print (most_comm)

Top  10  pairs of words
[(('winning', 'award'), 4), (('high', 'school'), 4), (('aspiring', 'think'), 3), (('filmmaker', 'make'), 3), (('people', 'think'), 3), (('filmmaker', 'writer'), 3), (('activist', 'think'), 3), (('activist', 'people'), 3), (('make', 'aspiring'), 3), (('make', 'writer'), 3)]


In [57]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
 
from sklearn.preprocessing import Normalizer
from sklearn import metrics
 
# K-Means Processing
n_clst = 4
cluster_doc = term_doc.drop(['bio'], axis=1)
 
kmeans = KMeans(n_clusters=n_clst, init='k-means++', random_state=0, max_iter=100, n_init=10, verbose=True)
print("Clustering sparse data with %s" % kmeans)
 
kmeans.fit(cluster_doc)
 
cluster_num = kmeans.predict(cluster_doc)
tweet_clean_list = [" ".join(tweet) for tweet in tweet_clean_fin]
 
labels = kmeans.labels_
df['BioClusters'] = kmeans.labels_
cluster_centers = kmeans.cluster_centers_
labels_unique = np.unique(labels)
 
lenlb = len(labels_unique)
label_elem = np.zeros([lenlb])
 
print (len(cluster_num), len(term_doc), len(tweet_clean_list), len(tweet_clean_fin))
 
cluster_tweet = pd.DataFrame({"bio": term_doc['bio'], "Cluster_Num": cluster_num, "Clean_Bio": tweet_clean_list})
tweet_prep(cluster_tweet)
 
cluster_top_pair = cluster_tweet.groupby("Cluster_Num").apply(tweet_prep)
elem_cluster = np.bincount(labels) # Number of elements per Cluster
print ("Top Cluster Pair")
print (cluster_top_pair)

for i in labels_unique:
    label_elem[i] = 0
     
    for l in labels:
        if l == i: label_elem[i] +=1
    print ("Label = ", i, "  Number of Elements = ", label_elem[i]) 
 
samp_size = min(num_tweets, 300) 
 
silh_score = metrics.silhouette_score(cluster_doc, labels, metric='euclidean', sample_size=samp_size)
print ("Silhouette score = ", round(silh_score, 3), "  for Sample Size = ", samp_size)
 
cluster_arr = cluster_doc.to_numpy()
BIC = compute_bic(kmeans,cluster_arr)
print ('BIC Score = ', round(BIC, 3))

Clustering sparse data with KMeans(max_iter=100, n_clusters=4, random_state=0, verbose=True)
Initialization complete
Iteration 0, inertia 1903.0
Iteration 1, inertia 1725.9470198675522
Converged at iteration 1: strict convergence.
Initialization complete
Iteration 0, inertia 2623.0
Iteration 1, inertia 1731.0224940968108
Iteration 2, inertia 1729.2499586661604
Iteration 3, inertia 1727.3763753574715
Iteration 4, inertia 1726.0130344108459
Converged at iteration 4: strict convergence.
Initialization complete
Iteration 0, inertia 2053.0
Iteration 1, inertia 1593.7551020408146
Converged at iteration 1: strict convergence.
Initialization complete
Iteration 0, inertia 3024.0
Iteration 1, inertia 1705.5808823529421
Converged at iteration 1: strict convergence.
Initialization complete
Iteration 0, inertia 1919.0
Iteration 1, inertia 1744.7019867549689
Converged at iteration 1: strict convergence.
Initialization complete
Iteration 0, inertia 1906.0
Iteration 1, inertia 1716.5540540540544
Conve

In [58]:
df.head()

Unnamed: 0,user,bio,BioClusters
0,MikePaineShow,voiceover talent voice actor & radio guy for 3...,2
1,gablescinema,South Florida's premiere nonprofit cinema for ...,2
2,dhumpachika,Follow us for the best of what's happening abo...,2
3,DavyVara,Filmmaker Writer Activist Aspiring Actor... I ...,0
4,keeeykeh,Dedicated student of the beautiful game. Head ...,2


In [61]:
for i in range(n_clst):
    Clust1wBios=df[df['BioClusters']==i]
    Clust1wBios.to_csv('{}/{}.csv'.format('BioClusters',i))

