In [None]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
pd.options.mode.chained_assignment = None
#nltk.download('punkt')
#nltk.download('stopwords')

In [None]:
# Build one free-text "essay" column by joining the essay columns of each profile.
full_df = pd.read_csv("../input/okcupid-profiles/okcupid_profiles.csv")

# Everything from column position 21 onward is joined.
# NOTE(review): presumably these are essay0..essay9 -- verify against the CSV header.
essay_columns = full_df.columns[21:]
full_df['essay'] = (
    full_df[essay_columns]
    .fillna('')   # a missing essay would otherwise become the literal word "nan"
    .astype(str)
    .apply(' '.join, axis=1)
)

# Explicit copy: `df` is an independent one-column frame, not a view of full_df.
df = full_df[["essay"]].copy()
df.head()


In [None]:
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
pd.options.mode.chained_assignment = None


class TextPreprocessor:
    """Text-cleaning steps for one string column of a pandas DataFrame.

    Every method taking ``(data_frame, column_name)`` overwrites
    ``data_frame[column_name]`` in place and returns the same DataFrame object,
    so the steps can be chained (see ``clean_text``). Pass a copy if the
    original values must be kept. Each step prints ``"<step> applied"``.

    ``remove_emojis`` and ``remove_empty_strings`` operate on an iterable of
    strings rather than on a DataFrame.
    """

    APOSTROPHE = '\u2019'
    # Character-class patterns used by remove_emojis.
    # NOTE(review): ENCLOSED_CHARS_REGEX and MISC_REGEX cover very wide code-point
    # ranges (MISC starts at U+00A9), so they also match accented letters and most
    # non-Latin scripts, not only emoji -- confirm this is intended.
    EMOTICONS_REGEX = r'[\U0001f600-\U0001f64f]+'
    DINGBATS_REGEX = r'[\U00002702-\U000027b0]+'
    TRANSPORT_AND_MAP_REGEX = r'[\U0001f680-\U0001f6c0]+'
    ENCLOSED_CHARS_REGEX = r'[\U000024c2-\U0001f251]+'
    MISC_REGEX = r'[\U000000a9-\U0001f999]'

    @staticmethod
    def _count_words(data_frame, column_name):
        """Return a Counter of whitespace-separated tokens over the whole column."""
        counts = Counter()
        for text in data_frame[column_name].values:
            counts.update(str(text).split())
        return counts

    @staticmethod
    def _drop_words(data_frame, column_name, unwanted):
        """Rewrite the column keeping only tokens that are not in ``unwanted``."""
        data_frame[column_name] = data_frame[column_name].apply(
            lambda text: " ".join(word for word in str(text).split() if word not in unwanted))
        return data_frame

    def make_lowercase(self, data_frame, column_name):
        """Lower-case every string in the column."""
        data_frame[column_name] = data_frame[column_name].str.lower()
        print('make_lowercase applied')
        return data_frame

    def remove_punctuation(self, data_frame, column_name):
        """Delete every character listed in ``string.punctuation`` from the column."""
        # Build the translation table once instead of once per row.
        delete_punctuation = str.maketrans('', '', string.punctuation)
        data_frame[column_name] = data_frame[column_name].apply(
            lambda text: text.translate(delete_punctuation))
        print('remove_punctuation applied')
        return data_frame

    def remove_stop_words(self, data_frame, column_name):
        """Drop tokens found in NLTK's English stop-word list."""
        stop_words = set(stopwords.words('english'))
        self._drop_words(data_frame, column_name, stop_words)
        print('remove_stop_words applied')
        return data_frame

    def remove_frequent_words(self, data_frame, column_name, max_frequent_words_count=10):
        """Drop the ``max_frequent_words_count`` most common tokens of the column.

        Frequencies are counted over the whole column; the default of 10 keeps
        the behaviour of the previously hard-coded cutoff.
        """
        counts = self._count_words(data_frame, column_name)
        frequent = {word for word, _ in counts.most_common(max_frequent_words_count)}
        self._drop_words(data_frame, column_name, frequent)
        print('remove_frequent_words applied')
        return data_frame

    def remove_rare_words(self, data_frame, column_name, max_rare_words_count=10):
        """Drop the ``max_rare_words_count`` least common tokens of the column.

        A count of zero or less removes nothing.
        """
        counts = self._count_words(data_frame, column_name)
        if max_rare_words_count > 0:
            # Tail of the frequency ranking, i.e. the rarest tokens.
            rare = {word for word, _ in counts.most_common()[:-max_rare_words_count - 1:-1]}
        else:
            # Guard: a negative count would turn the slice into "almost everything".
            rare = set()
        self._drop_words(data_frame, column_name, rare)
        print('remove_rare_words applied')
        return data_frame

    def stem_words(self, data_frame, column_name):
        """Replace each whitespace-separated token by its Porter stem."""
        stemmer = PorterStemmer()
        data_frame[column_name] = data_frame[column_name].apply(
            lambda text: " ".join(stemmer.stem(word) for word in text.split()))
        print('stem_words applied')
        return data_frame

    def lemmatize_words(self, data_frame, column_name):
        """Lemmatize each token with WordNetLemmatizer using its default part of speech."""
        lemmatizer = WordNetLemmatizer()
        data_frame[column_name] = data_frame[column_name].apply(
            lambda text: " ".join(lemmatizer.lemmatize(word) for word in text.split()))
        print('lemmatize_words applied')
        return data_frame

    def remove_numbers(self, data_frame, column_name):
        """Replace every run of digits in the column with a single space."""
        number_pattern = r'\d+'
        data_frame[column_name] = data_frame[column_name].apply(
            lambda text: re.sub(pattern=number_pattern, repl=" ", string=text))
        print('remove_numbers applied')
        return data_frame

    def lemmatize_words_v2(self, data_frame, column_name):
        """Lemmatize each token using the part of speech reported by ``nltk.pos_tag``.

        The first letter of the tag (N, V, J, R) selects the WordNet part of
        speech; any other tag falls back to noun.
        """
        lemmatizer = WordNetLemmatizer()
        wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

        def lemmatize_text(text):
            tagged = nltk.pos_tag(text.split())
            return " ".join(
                lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN))
                for word, pos in tagged)

        data_frame[column_name] = data_frame[column_name].apply(lemmatize_text)
        print('lemmatize_words_v2 applied')
        return data_frame

    def tokenize(self, data_frame, column_name):
        """Replace each string by the token list from ``nltk.tokenize.word_tokenize``."""
        data_frame[column_name] = data_frame[column_name].apply(nltk.tokenize.word_tokenize)
        print('tokenize applied')
        return data_frame

    def clean_text(self, data_frame, column_name):
        """Run the full cleaning pipeline on the column and return the DataFrame.

        Order: lower-case, strip punctuation, strip digits, drop stop words,
        drop rare words, drop frequent words, POS-aware lemmatization.
        Tokenization is not part of the pipeline, so the column stays a string.
        The input frame itself is modified.
        """
        data_frame_local = self.make_lowercase(data_frame, column_name)
        data_frame_local = self.remove_punctuation(data_frame_local, column_name)
        data_frame_local = self.remove_numbers(data_frame_local, column_name)
        data_frame_local = self.remove_stop_words(data_frame_local, column_name)
        data_frame_local = self.remove_rare_words(data_frame_local, column_name)
        data_frame_local = self.remove_frequent_words(data_frame_local, column_name)
        data_frame_local = self.lemmatize_words_v2(data_frame_local, column_name)
        return data_frame_local

    @classmethod
    def remove_emojis(cls, data):
        """Strip characters matched by the class regex constants from each string.

        ``data`` is an iterable of strings; a list of the same length is
        returned (a string made only of matched characters becomes ``''``).
        Callable as ``TextPreprocessor.remove_emojis(data)`` or on an instance.
        """
        patterns = (
            cls.EMOTICONS_REGEX,
            cls.ENCLOSED_CHARS_REGEX,
            cls.DINGBATS_REGEX,
            cls.TRANSPORT_AND_MAP_REGEX,
            cls.MISC_REGEX,
        )
        result = []
        for word in data:
            for pattern in patterns:
                word = re.sub(pattern, '', word)
            result.append(word)
        return result

    @staticmethod
    def remove_empty_strings(data):
        """Return the items of ``data`` that are not the empty string."""
        return [word for word in data if word != '']



In [None]:
# Work on a copy so that later reassignments of df_test leave `df` untouched.
df_test = df.copy()

In [None]:
# Instantiate the preprocessor (it is instantiated again right before cleaning).
text_processor = TextPreprocessor()

In [None]:
# Display the working copy of the essay frame before it is sampled.
df_test

In [None]:
# Keep a reproducible 1% random sample (fixed random_state) of the essays.
# NOTE: df_test is replaced by its own sample, so re-running this cell without
# re-running the copy cell shrinks the data again.
df_test = df_test.sample(frac=0.01,random_state=10)

In [None]:
text_processor = TextPreprocessor()
# clean_text overwrites the column of the frame it is given, so hand it a copy:
# df_test keeps the raw sample and re-running this cell does not clean
# already-cleaned text a second time.
df_clean = text_processor.clean_text(df_test.copy(), 'essay')
df_clean.head(5)

Exploratory analysis of the cleaned essays: word frequencies, TF-IDF, clustering and topic modelling

In [None]:
import numpy as np
import pandas as pd
from IPython.display import display
from tqdm import tqdm
from collections import Counter
import ast

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sb

from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
import scipy.stats as stats

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE

from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()

%matplotlib inline

In [None]:
# Define helper functions
def get_top_n_words(n_top_words, count_vectorizer, text_data):
    '''
    Return the most frequent terms of a text sample and their total counts.

    Parameters
    ----------
    n_top_words : int
        Number of terms wanted; clamped to the vocabulary size.
    count_vectorizer : CountVectorizer-like
        Object whose ``fit_transform`` returns a document-term count matrix and
        which exposes ``vocabulary_`` (term -> column index) afterwards.
        It is fitted on ``text_data`` by this call.
    text_data : pandas.Series
        Documents; ``text_data.values`` is passed to ``fit_transform``.

    Returns
    -------
    tuple (words, counts)
        ``words`` is a list of terms ordered from most to least frequent and
        ``counts`` the matching list of total occurrences.
    '''
    doc_term_counts = count_vectorizer.fit_transform(text_data.values)
    # Column sums as a flat 1-D array (works for sparse matrices and ndarrays).
    totals = np.asarray(doc_term_counts.sum(axis=0)).ravel()

    n_top_words = max(0, min(n_top_words, totals.shape[0]))
    ranked = np.argsort(totals)[::-1][:n_top_words]

    # Terms are returned as-is; no ASCII round-trip, so non-ASCII words are fine.
    index_to_term = {index: term for term, index in count_vectorizer.vocabulary_.items()}
    words = [index_to_term[index] for index in ranked.tolist()]

    return (words, totals[ranked].tolist())

In [None]:
# Bar chart of the 15 most frequent words in the cleaned essays.
count_vectorizer = CountVectorizer(stop_words='english')
words, word_values = get_top_n_words(n_top_words=15,
                                     count_vectorizer=count_vectorizer,
                                     text_data=df_clean['essay'])

fig, ax = plt.subplots(figsize=(16, 8))
bar_positions = range(len(words))
ax.bar(bar_positions, word_values)
ax.set_xticks(bar_positions)
ax.set_xticklabels(words, rotation='vertical')
# The data are OkCupid essays, not headlines.
ax.set_title('Top words in essays dataset (excluding stop words)')
ax.set_xlabel('Word')
ax.set_ylabel('Number of occurrences')
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

from yellowbrick.text import FreqDistVisualizer

# Token-frequency distribution of the cleaned essays.
vectorizer = CountVectorizer()
docs = vectorizer.fit_transform(df_clean['essay'])

# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

visualizer = FreqDistVisualizer(features=features, orient='v')
visualizer.fit(docs)
visualizer.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Drop rows with missing values before vectorizing.
df_clean = df_clean.dropna()

# TF-IDF restricted to 1000 terms, ignoring English stop words and any term
# that appears in more than half of the documents.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    max_df=0.5,
    smooth_idf=True,
)
X = vectorizer.fit_transform(df_clean['essay'])

# Shape of the document-term matrix.
X.shape

In [None]:
# Show row 0 of X, the TF-IDF matrix built in the previous cell.
X.getrow(0)

In [None]:
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Settings shared by the clustering and projection cells below.
pca_num_components = 2
tsne_num_components = 2
num_clusters = 10
num_seeds = 10
max_iterations = 300

# One hex colour per cluster label, keyed 0..9.
labels_color_map = dict(enumerate([
    '#20b2aa', '#ff7373', '#ffe4e1', '#005073', '#4d0404',
    '#ccc0ba', '#4700f9', '#f6f900', '#00f91d', '#da8c49',
]))

In [None]:
# TF-IDF over the cleaned essays using word unigrams and bigrams.
tf_idf_vectorizer = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
)
tf_idf_matrix = tf_idf_vectorizer.fit_transform(df_clean['essay'])

In [None]:
# Dense view of the TF-IDF matrix as a DataFrame with one column per term.
# toarray() yields a plain ndarray that pandas can wrap directly, avoiding the
# much larger intermediate Python list of lists.
dense = tf_idf_matrix.toarray()

# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever this installation provides.
if hasattr(tf_idf_vectorizer, 'get_feature_names_out'):
    tf_idf_terms = list(tf_idf_vectorizer.get_feature_names_out())
else:
    tf_idf_terms = tf_idf_vectorizer.get_feature_names()

# NOTE: this rebinds `df`, which earlier held the raw essay column.
df = pd.DataFrame(dense, columns=tf_idf_terms)

In [None]:
# Preview the first five rows of the dense TF-IDF table built above.
df.head(5)

In [None]:

# create k-means model with custom config
# precompute_distances and n_jobs are gone from current scikit-learn, so they
# are not passed; n_init is wired to the num_seeds setting and a fixed
# random_state makes the cluster labels reproducible.
clustering_model = KMeans(
    n_clusters=num_clusters,
    n_init=num_seeds,
    max_iter=max_iterations,
    random_state=10,
)

labels = clustering_model.fit_predict(tf_idf_matrix)

# Dense ndarray copy for the projection cells below (toarray() rather than
# todense(), which returns np.matrix).
X = tf_idf_matrix.toarray()

In [None]:
# PCA projection of the documents, coloured by k-means cluster.
reduced_data = PCA(n_components=pca_num_components).fit_transform(X)

# One colour per document, looked up from its cluster label.
point_colors = [labels_color_map[label] for label in labels]

fig, ax = plt.subplots()
# A single vectorized scatter call instead of one call per document.
ax.scatter(reduced_data[:, 0], reduced_data[:, 1], c=point_colors)
ax.set_title('PCA projection of essay TF-IDF vectors by k-means cluster')
ax.set_xlabel('PCA component 1')
ax.set_ylabel('PCA component 2')
plt.show()

In [None]:
# t-SNE plot
# random_state fixes the embedding so the figure is reproducible.
embeddings = TSNE(n_components=tsne_num_components, random_state=10)
Y = embeddings.fit_transform(X)

fig, ax = plt.subplots()
# cmap only takes effect when `c` is given; colour the points by cluster label.
ax.scatter(Y[:, 0], Y[:, 1], c=labels, cmap=plt.cm.Spectral)
ax.set_title('t-SNE projection of essay TF-IDF vectors by k-means cluster')
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')
plt.show()

In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
# linear_kernel returns plain dot products between the rows of X; these equal
# cosine similarities only if the rows are L2-normalised.
# NOTE(review): assumes the TfidfVectorizer that produced X used its default
# normalisation -- confirm.
cosine_sim = linear_kernel(X, X)

In [None]:
from sklearn.decomposition import TruncatedSVD

# Truncated SVD with 20 components: represents documents and terms as vectors.
# fit() returns the fitted estimator, so construction and fitting are chained.
svd_model = TruncatedSVD(
    n_components=20,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
).fit(X)

# Number of components that were extracted.
len(svd_model.components_)

In [None]:
# svd_model was fitted on X, which at this point is the dense matrix built from
# tf_idf_vectorizer. Term names must therefore come from that vectorizer (not
# from the 1000-feature `vectorizer`), otherwise weights are paired with the
# wrong words.
if hasattr(tf_idf_vectorizer, 'get_feature_names_out'):
    terms = list(tf_idf_vectorizer.get_feature_names_out())
else:
    terms = tf_idf_vectorizer.get_feature_names()

assert len(terms) == svd_model.components_.shape[1], \
    "term list does not match the matrix the SVD was fitted on"

# Print the 7 highest-weighted terms of every component.
for i, comp in enumerate(svd_model.components_):
    top_terms = sorted(zip(terms, comp), key=lambda pair: pair[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    print(*(term for term, _ in top_terms))

In [None]:
# import umap

# X_topics = svd_model.fit_transform(X)
# embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(X_topics)

In [None]:
# df_clean.head(5)

In [None]:
# import pandas as pd
# import numpy as np
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# from sklearn.metrics.pairwise import linear_kernel
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer
# from nltk.tokenize import RegexpTokenizer
# import re
# import string
# import random
# from PIL import Image
# import requests
# from io import BytesIO
# import matplotlib.pyplot as plt
# %matplotlib inline
# from sklearn.metrics.pairwise import cosine_similarity
# from gensim.models import Word2Vec
# from gensim.models.phrases import Phrases, Phraser
# from matplotlib import pyplot
# from gensim.models import KeyedVectors

# plt.figure(figsize=(7,5))
# plt.scatter(embedding[:, 0], embedding[:, 1], 
# c = df_clean.essay,
# s = 10, # size
# edgecolor='none'
# )
# plt.show()

In [None]:
# from sklearn.datasets import fetch_20newsgroups

# dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))

In [None]:
# print(dataset.target)