In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Standards

- ## NLP
  - ### EDA NLP
  - ### Unigram analysis using LDA
  - ### Bigram analysis using LDA
- ## CLASSIFICATION PROBLEM
- ## REGRESSION PROBLEM

# NLP

## Usual Dependencies

In [None]:
#Standards
import numpy as np
import pandas as pd

#For preprocessing
import re  
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

#For Modeling and Visualizing the result
import pyLDAvis.gensim
import gensim
from gensim import corpora # for creating dictionary
from gensim import models # topic modeler
from gensim.utils import simple_preprocess

#EDA related
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import spacy

#Just to keep you sane
import warnings
warnings.filterwarnings("ignore")

### EDA NLP

#### Using Count Vectorizer to get top words (frequency) in a series

In [None]:
count_vectorizer = CountVectorizer(stop_words = "english")

In [10]:
def get_top_n_words(n_top_words, count_vectorizer, text_data):
    """
    Return the most frequent terms in a collection of texts.

    n_top_words : maximum number of words to get; when the fitted vocabulary
                  is smaller than this, every vocabulary term is returned
    count_vectorizer : CountVectorizer instance; it is (re)fitted on text_data
    text_data : Series of text

    returns : words (actual words), word_values (number of occurrences),
              both ordered from most to least frequent
    """

    vectorized_headlines = count_vectorizer.fit_transform(text_data.values)

    # Column sums give one total per vocabulary term. Depending on the matrix
    # type the sum comes back as a 1 x V matrix or a 1-D array, so flatten it
    # to a plain 1-D ndarray before indexing.
    term_totals = np.asarray(vectorized_headlines.sum(axis = 0)).ravel()

    # Never ask for more terms than the vocabulary holds (or fewer than zero).
    n_top_words = max(0, min(n_top_words, term_totals.size))
    top_indices = np.argsort(term_totals)[::-1][:n_top_words]

    # vocabulary_ maps term -> column index; invert it to look terms up by column.
    # Terms are returned as-is, so non-ASCII words no longer raise on encoding.
    index_to_word = {index: word for word, index in count_vectorizer.vocabulary_.items()}
    words = [index_to_word[index] for index in top_indices.tolist()]

    return (words, term_totals[top_indices].tolist())

In [None]:
# Bar chart of the 15 most frequent words in the non-null NPS comments.
nps_comments = data["Likelihood to Recommend (NPS) Comment"].dropna()
words, word_values = get_top_n_words(n_top_words = 15,
                                     count_vectorizer = count_vectorizer,
                                     text_data = nps_comments)

fig, ax = plt.subplots(figsize = (17, 8))

# One bar per word, in the order returned by get_top_n_words.
bar_positions = range(len(words))
ax.bar(bar_positions, word_values, color = "dodgerblue")
ax.set_xticks(bar_positions)
ax.set_xticklabels(words)
ax.set_title("Top 15 Words in General", fontsize = 20)

plt.show()

#### Named Entity Recognition using Spacy

Suppose "x" is a Series of sentences.

In [None]:
nlp = spacy.load("en_core_web_lg")

In [None]:
def get_entity(sentence):
    """
    Run the module-level spaCy pipeline `nlp` on a sentence and collect
    its named entities.

    :returns: (entity_text, entity_label) - two parallel lists holding the
              text and the label of every entity found, in document order
    """
    parsed = nlp(sentence)

    entity_text = [found.text for found in parsed.ents]
    entity_label = [found.label_ for found in parsed.ents]

    return entity_text, entity_label

In [None]:
doc_entities = [get_entity(v) for v in x]

In [None]:
def entity_cleaner(get_ent_data):
    """
    Flatten per-document entity lists into one long-format DataFrame.

    get_ent_data : iterable of (words, labels) pairs, one pair per document,
                   where words and labels are parallel lists

    :returns: DataFrame with one row per entity and the columns
              "Word", "Label" and "Index" (position of the source document)
    """
    # Documents without entities simply contribute no rows.
    rows = [
        (word, label, doc_position)
        for doc_position, (doc_words, doc_labels) in enumerate(get_ent_data)
        for word, label in zip(doc_words, doc_labels)
    ]

    return pd.DataFrame(rows, columns=["Word", "Label", "Index"])

In [None]:
df_word_entities_docs = entity_cleaner(doc_entities)

In [None]:
df_word_entities_docs.head()

### Unigram analysis using LDA

#### Preprocessing

##### Route 1

In [None]:
def lemmatize_stemming(word):
    """
    Lemmatize a single word with WordNet, treating it as a verb (pos = "v").

    Despite the function name, no stemmer is applied here; only
    lemmatization is performed.

    :returns: the lemmatized word
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word, pos = "v")

def preprocess(sentence):
    """
    Tokenize a string, drop stopwords and short tokens, and lemmatize the rest.

    Tokens come from gensim's simple_preprocess. A token is kept only when it
    is not in gensim's STOPWORDS and is longer than 3 characters; kept tokens
    are passed through lemmatize_stemming.

    :returns: list of lemmatized tokens
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS

    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(sentence)
        if isinstance(token, str) and token not in stopwords and len(token) > 3
    ]

In [None]:
processed_docs = x.map(preprocess)

##### Route 2

In [None]:
nlp = spacy.load("en_core_web_lg", disable=["ner", "parser"]) #<- Removes named entity recognition and parser

In [None]:
def lemma_stop(doc):
    """
    Lemmatize the tokens of a doc and drop the stop words.

    :returns: the remaining lemmas joined by single spaces, or None when
              two or fewer lemmas are left after stop-word removal
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Very short results are discarded rather than returned.
    if len(lemmas) <= 2:
        return None

    return " ".join(lemmas)

In [None]:
# Keep only letters and apostrophes, collapse everything else to a single space, lowercase.
brief_cleaning = (re.sub("[^A-Za-z']+", " ", str(row)).lower() for row in clean_data["Likelihood to Recommend (NPS) Comment"])

# n_threads is dropped: it is no longer a parameter of nlp.pipe in current spaCy releases.
lemmatized_texts = (lemma_stop(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)) #applying the function, removes the stop words and lemmatize

# lemma_stop returns a space-joined string, or None for very short documents.
# corpora.Dictionary and doc2bow (used below) need one list of tokens per document,
# so skip the None entries and split each string back into tokens.
processed_docs = [text.split() for text in lemmatized_texts if text is not None]

#### Creating a dictionary

In [None]:
dictionary = corpora.Dictionary(processed_docs)

In [None]:
# Print the first 11 (key, value) pairs of the dictionary as a quick sanity check.
count = 0
for count, (a, b) in enumerate(dictionary.iteritems(), start = 1):
    print(a, b)

    # Stop once more than 10 pairs have been shown.
    if count > 10:
        break

In [None]:
dictionary.filter_extremes(no_below = 2, keep_n = 10000) #When you're trying to control the content

#### Creating bag of words corpus

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [None]:
# Inspect the bag-of-words entries of the second document.
bow_doc = bow_corpus[1]

# Each entry is indexed as [0] = dictionary id and [1] = count within the document.
for token_id, token_count in bow_doc:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], token_count))

#### Modeling using LDA

In [None]:
n_topics = 5

lda = models.LdaMulticore(bow_corpus, num_topics = n_topics, id2word = dictionary, passes = 10, workers = 1, random_state = 0)

#### Visualizing using pyLDAvis

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, bow_corpus, dictionary)
vis

### Bigram analysis using LDA

Here "x" is the Series of text documents to analyse.

In [None]:
def list_to_sentence(bigram_data):
    """
    Join an iterable of words into one space-separated string.

    :returns: the words separated by single spaces ("" for an empty input)
    """
    sentence = " ".join(bigram_data)
    return sentence

In [None]:
# Create Unigram
unigram_data = x.map(preprocess)

# Create Bigram
# Join each token list back into a sentence, then capture every "word word" pair.
# The pattern sits inside a lookahead so consecutive matches may overlap.
bigram_data = unigram_data.map(list_to_sentence).str.findall(r"(?=(\b\w+\b \S+))")

In [None]:
combined_data = bigram_data + unigram_data #combines the list of bigram and unigram into one list

#### Creating bigram dictionary

In [None]:
dictionary_bigram = corpora.Dictionary(combined_data)

In [None]:
dictionary_bigram.filter_extremes(no_below = 2, keep_n = 10000)

#### Creating bigram bag of words

In [None]:
bow_corpus_bigram = [dictionary_bigram.doc2bow(doc) for doc in combined_data]

#### Modeling using LDA

In [None]:
# Train the LDA model on the combined unigram + bigram corpus.
# This cell previously duplicated the visualisation cell below, so lda_model_bigram
# was never defined. The settings mirror the unigram model trained earlier,
# including n_topics from that section.
lda_model_bigram = models.LdaMulticore(bow_corpus_bigram, num_topics = n_topics, id2word = dictionary_bigram, passes = 10, workers = 1, random_state = 0)

#### Visualizing using pyLDAvis

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_bigram, bow_corpus_bigram, dictionary_bigram)
vis

------------------------------------------------------------

In [1]:
a = []

In [2]:
type(a)

list

In [7]:
assert type(a) == list

In [24]:
s1 = pd.DataFrame([1,2,3,4], columns = ["Test"])

In [25]:
dtype(s1["Test"])

dtype('int64')

In [26]:
assert dtype(s1["Test"]) == "O"

AssertionError: 

In [14]:
dtype(s1)

TypeError: data type not understood

In [11]:
s1.str.isalpha()

0     True
1    False
2    False
3    False
dtype: bool

In [None]:
pd.read_json()

In [40]:
args1 = "psi"
args2 = 'lavr'
args3 = 'dsc'
args4 = 'ifsc'
args5 = 'ics'
args6 = 'eng'
args7 = 'ol1'
args8 = 'ol2'
args9 = 'reso'
args10 = 'educ'
args11 = 'listen'
args12 = 'time'
args13 = 'probe'
args14 = 'unders'

args_list = [args1, args2, args3, args4, args5, args6, args7, args8, args9, args10, args11, args12, args13, args14]

In [45]:
# One-row frame whose columns and values are both the short argument names.
# Uses args_list, the name defined in the preceding cell; the original referenced
# an undefined `arg_list`.
a = pd.DataFrame(args_list, index = args_list).T

In [41]:
a.columns = args_list

In [47]:
a.shape

(1, 14)

In [51]:
b = {"Nesting_Quality_Prediction":[1,0],"Nesting_AHT_Prediction":[1,0],"Nesting_DSAT_Prediction":[1,0],"30_Days_Quality_Prediction":[1,0],"30_Days_AHT_Prediction":[1,0],"30_Days_DSAT_Prediction":[1,0]}

In [52]:
type(b)

dict

In [53]:
b

{'Nesting_Quality_Prediction': [1, 0],
 'Nesting_AHT_Prediction': [1, 0],
 'Nesting_DSAT_Prediction': [1, 0],
 '30_Days_Quality_Prediction': [1, 0],
 '30_Days_AHT_Prediction': [1, 0],
 '30_Days_DSAT_Prediction': [1, 0]}

In [57]:
pd.read_json({'Nesting_Quality_Prediction': [1, 0],
 'Nesting_AHT_Prediction': [1, 0],
 'Nesting_DSAT_Prediction': [1, 0],
 '30_Days_Quality_Prediction': [1, 0],
 '30_Days_AHT_Prediction': [1, 0],
 '30_Days_DSAT_Prediction': [1, 0]}, orient = "record")

ValueError: Invalid file path or buffer object type: <class 'dict'>

In [83]:
a = ['psi', 'lavr', 'dsc', 'ifsc', 'ics', 'eng', 'ol1', 'ol2', 'reso', 'educ', 'listen', 'time', 'probe', 'unders']
b = ['1', '1', '1', '1', '1', '3', '1', '1', '1', '1', '1', '1', '1', '1']
c = zip(a,b)
d = dict(c)

In [48]:
PSI = 'Training PSI'
LAVR = 'Training LAVR'
DSC = 'Training DSC'
IFSC = 'Training IFSC'
ICSSBT = 'Training ICS-SBT % Score'
ENGLISH_ASSESS = 'Training English Assessment % Score'
ONLINE_ASSESS = 'Training Online Assessment % Score'
ONLINE_ASSESS_RETAKE = 'Training Online Assessment Retake'
RESOLUTION = 'Training QM - Correct Resolution (25%)'
EDUCATE = 'Training QM - Educate the customer (25%)'
LISTENING = 'Training QM - Actively Listening to the customer (12.50%)'
TIME = 'Training QM - Time Management (12.50%)'
PROBING = 'Training QM - Ask relevant & effective probing question (12.50%)'
UNDERSTAND = 'Training QM - Easy to understand (12.50%)'

In [80]:
asd.replace()

Unnamed: 0,0
psi,1
lavr,1
dsc,1
ifsc,1
ics,1
eng,3
ol1,1
ol2,1
reso,1
educ,1


In [68]:
asd = pd.DataFrame.from_dict(d, orient = "index")

index_map = {'psi': PSI, 'lavr': LAVR, 'dsc': DSC, 'ifsc': IFSC, 'ics': ICSSBT,
             'eng': ENGLISH_ASSESS, 'ol1': ONLINE_ASSESS, 'ol2': ONLINE_ASSESS_RETAKE,
             'reso': RESOLUTION, 'educ': EDUCATE, 'listen': LISTENING, 'time': TIME, 'probe': PROBING, 'unders': UNDERSTAND}

In [81]:
asd.index = asd.index.map(index_map)

In [86]:
asd = asd.T

In [87]:
asd

Unnamed: 0,Training PSI,Training LAVR,Training DSC,Training IFSC,Training ICS-SBT % Score,Training English Assessment % Score,Training Online Assessment % Score,Training Online Assessment Retake,Training QM - Correct Resolution (25%),Training QM - Educate the customer (25%),Training QM - Actively Listening to the customer (12.50%),Training QM - Time Management (12.50%),Training QM - Ask relevant & effective probing question (12.50%),Training QM - Easy to understand (12.50%)
0,1,1,1,1,1,3,1,1,1,1,1,1,1,1


In [9]:
a = [1]

In [12]:
b = pd.DataFrame(a, index = ["asd"])

In [14]:
b.T

Unnamed: 0,asd
0,1


In [84]:
x = pd.read_json("accounts.json")

In [85]:
x

Unnamed: 0,Department,Id,LOBs,Name
0,Operations,1,"[{'Id': 1, 'AccountId': 1, 'Name': 'ADC', 'Sta...",Abbott
1,Operations,2,"[{'Id': 3, 'AccountId': 2, 'Name': 'Chat', 'St...",Ally CS
2,Operations,3,"[{'Id': 6, 'AccountId': 3, 'Name': 'Non Reg â€...",Ameriprise Financials
3,Operations,4,"[{'Id': 13, 'AccountId': 4, 'Name': 'Exception...",Bank of America
4,Operations,5,"[{'Id': 16, 'AccountId': 5, 'Name': 'CCT', 'St...",Capital One Canada
5,Operations,7,"[{'Id': 21, 'AccountId': 7, 'Name': 'CRS', 'St...",Citi
6,Operations,8,"[{'Id': 23, 'AccountId': 8, 'Name': 'Sales', '...",Expedia
7,Operations,11,"[{'Id': 28, 'AccountId': 11, 'Name': 'English'...",Hotelbeds
8,Operations,12,"[{'Id': 31, 'AccountId': 12, 'Name': '6J', 'St...",HP
9,Operations,13,"[{'Id': 33, 'AccountId': 13, 'Name': 'Innovati...",Innovations Team


In [87]:
x.to_json("asd.json", orient = "records")

In [24]:
x.to_csv("dsa.csv", index = False)

In [26]:
dsa = [[{'Id': 1, 'AccountId': 1, 'Name': 'ADC', 'StartDate': '2018-01-01T00:00:00', 'EndDate': None}],
[{'Id': 3, 'AccountId': 2, 'Name': 'Chat', 'StartDate': '2018-01-01T00:00:00', 'EndDate': None}, {'Id': 4, 'AccountId': 2, 'Name': 'Voice', 'StartDate': '2018-01-01T00:00:00', 'EndDate': None}],
[{'Id': 6, 'AccountId': 3, 'Name': 'Non Reg Ã¢â‚¬â€œ BNB', 'StartDate': '2018-01-01T00:00:00', 'EndDate': None}, {'Id': 7, 'AccountId': 3, 'Name': 'Non Reg Ã¢â‚¬â€œ CCQ/ICE', 'StartDate': '2018-01-01T00:00:00', 'EndDate': None}, {'Id': 8, 'AccountId': 3, 'Name': 'Non Reg Ã¢â‚¬â€œ Client Data/FP', 'StartDate': '2018-01-01T00:00:00', 'EndDate': None}, {'Id': 9, 'AccountId': 3, 'Name': 'Non Reg Ã¢â‚¬â€œ cWeb', 'StartDate': '2018-01-01T00:00:00', 'EndDate': None}, {'Id': 11, 'AccountId': 3, 'Name': 'FINRA-RSO_OD', 'StartDate': '2018-01-01T00:00:00', 'EndDate': None}, {'Id': 12, 'AccountId': 3, 'Name': 'FINRA-Certs/529', 'StartDate': '2018-01-01T00:00:00', 'EndDate': None}],
[{'Id': 13, 'AccountId': 4, 'Name': 'Exception Queue (EQ)', 'StartDate': '2018-01-01T00:00:00', 'EndDate': None}, {'Id': 14, 'AccountId': 4, 'Name': 'Lockbox Centralized Operations Keying (LCOK)', 'StartDate': '2018-01-01T00:00:00', 'EndDate': None}],
]

In [28]:
pd.DataFrame(dsa)

Unnamed: 0,0,1,2,3,4,5
0,"{'Id': 1, 'AccountId': 1, 'Name': 'ADC', 'Star...",,,,,
1,"{'Id': 3, 'AccountId': 2, 'Name': 'Chat', 'Sta...","{'Id': 4, 'AccountId': 2, 'Name': 'Voice', 'St...",,,,
2,"{'Id': 6, 'AccountId': 3, 'Name': 'Non Reg Ã¢â...","{'Id': 7, 'AccountId': 3, 'Name': 'Non Reg Ã¢â...","{'Id': 8, 'AccountId': 3, 'Name': 'Non Reg Ã¢â...","{'Id': 9, 'AccountId': 3, 'Name': 'Non Reg Ã¢â...","{'Id': 11, 'AccountId': 3, 'Name': 'FINRA-RSO_...","{'Id': 12, 'AccountId': 3, 'Name': 'FINRA-Cert..."
3,"{'Id': 13, 'AccountId': 4, 'Name': 'Exception ...","{'Id': 14, 'AccountId': 4, 'Name': 'Lockbox Ce...",,,,


In [106]:
quality_nesting_preds = pd.DataFrame([{'upper' : 12, 'score': 13, 'lower': 2}], index = ["Nesting_Quality_Prediction"])
aht_nesting_preds = pd.DataFrame([{'upper' : 12, 'score': 13, 'lower': 2}], index = ["Nesting_AHT_Prediction"])
dsat_nesting_preds = pd.DataFrame([{'upper' : 12, 'score': 13, 'lower': 2}], index = ["Nesting_DSAT_Prediction"])

In [107]:
dfs = [quality_nesting_preds, aht_nesting_preds, dsat_nesting_preds]

a = pd.DataFrame

In [108]:
pd.concat(dfs, ignore_index=False).T.to_json("asd.json")

In [None]:
from functools import reduce  # reduce is not a builtin in Python 3

# Fold the list of frames into one by repeatedly joining on the index.
predictions = reduce(lambda left, right: pd.merge(left, right, left_index = True, right_index = True), dfs)

In [101]:
a.append(quality_nesting_preds, aht_nesting_preds).T.to_json("asd.json")

TypeError: cannot concatenate object of type "<class 'type'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid

In [83]:
pd.DataFrame([{'asd' : 0.52, 'ew': 0.1, 'uy': 12}, {'asd' : 0.52, 'ew': 0.1, 'uy': 12}]).to_json("asd.json")

In [74]:
pd.DataFrame([{'asd' : 0.52, 'ew': 0.1, 'uy': 12}, {'asd' : 12, 'ew': 13, 'uy': 2}])

Unnamed: 0,asd,ew,uy
0,0.52,0.1,12
1,12.0,13.0,2


In [7]:
import pandas as pd
import numpy as np

In [8]:
make_df = ["Hello", "World", "abcd", 12.4, np.nan,  "qwerty123"]
df = pd.DataFrame(make_df, columns = ["col1"])

In [11]:
df.loc[df["col1"].isna() == False]

Unnamed: 0,col1
0,Hello
1,World
2,abcd
3,12.4
5,qwerty123


In [12]:
elements = [(1,1,1),(2,3,7),(3,5,10)]

In [14]:
n = 2 # N. . .
[x[n] for x in elements]

[1, 7, 10]

In [1]:
mylist = ["a", "b", "a", "c", "c"]
mylist = list(dict.fromkeys(mylist))
print(mylist) 

['a', 'b', 'c']


In [50]:
def one():
    """Print "one"; the return value is None (the result of print)."""
    print("one")
    return None

def two():
    """Print "two"; the return value is None (the result of print)."""
    print("two")
    return None

def three(dsa, asd):
    """Print the result of dsa + asd; the return value is None."""
    combined = dsa + asd
    print(combined)
    return None

In [59]:
args = 3

def get_account(argument):
    """
    Look up the function registered for a numeric key (dict-based dispatch).

    argument : 1, 2 or 3 select the functions one, two and three respectively

    :returns: the selected function object (not called here), or the string
              "nothing" when the key is not registered
    """
    dispatch = {1: one, 2: two, 3: three}

    if argument in dispatch:
        return dispatch[argument]

    # NOTE(review): the fallback is a plain string, not a callable, so a caller
    # that immediately invokes the result will get a TypeError for unknown keys.
    return "nothing"

get_account(args)(1,5)

6


In [60]:
import requests

In [83]:
def get_web(accountID, url = "http://phmnl5dev025:30004/v1/accounts", timeout = 10):
    """
    Gets the corresponding account name from an API

    :accountID: INT dtype, the "Id" of the account to look up
    :url: endpoint returning a JSON list of objects that each have "Id" and "Name" keys
    :timeout: seconds to wait for the server before giving up

    :returns: the matching account name, or the string "None" when no account has that Id
    :raises requests.RequestException: on connection errors, timeouts or an HTTP error status
    """

    # Without an explicit timeout the request can wait indefinitely on an unresponsive host.
    response = requests.get(url, timeout = timeout)
    # Fail on HTTP error statuses instead of trying to read an error body as account data.
    response.raise_for_status()

    dict_account = {account["Id"]: account["Name"] for account in response.json()}

    return dict_account.get(accountID, "None")

args = 1

str.lower(get_web(args))

'abbott'

In [84]:
a = ["act","psi","lavr","dsc","ifsc","ics","eng","ol1","ol2","reso","educ","listen","time","probe","unders"]
b = ["act"]

a - b

TypeError: unsupported operand type(s) for -: 'list' and 'list'

In [85]:
pd.DataFrame()

Unnamed: 0,asd,ew,uy
0,0.52,0.1,12
1,12.0,13.0,2


In [101]:
data = [{'a' : 1, 'b': 1, 'c': 1},
        {'a' : 2, 'b': 2, 'c': 2},
        {'a' : 3, 'b': 3, 'c': 3},
        {'a' : 4, 'b': 4, 'c': 4},
        {'a' : 5, 'b': 5, 'c': 5},
        {'a' : 1, 'b': 2, 'c': 3},
        {'a' : 4, 'b': 5, 'c': 1},
        {'a' : 3, 'b': 4, 'c': 5},
        ]

In [110]:
data = [{'a' : "[123] [321]"},
        {'a' : "[456] [123]"},
        {'a' : "[789] [321]"},
        {'a' : "[987] [456]"}]

In [111]:
data = pd.DataFrame(data)
data

Unnamed: 0,a
0,[123] [321]
1,[456] [123]
2,[789] [321]
3,[987] [456]


In [104]:
data = data[data.nunique(axis=1).ne(1)]

In [105]:
data

Unnamed: 0,a,b,c
5,1,2,3
6,4,5,1
7,3,4,5


In [4]:
data = [{"a": ["123","321"]},
        {"a": ["456","654"]},
        {"a": ["312","456"]},
       ]

data = pd.DataFrame(data)
data

Unnamed: 0,a
0,"[123, 321]"
1,"[456, 654]"
2,"[312, 456]"


In [15]:
data = [{"a": "123; 321"},
        {"a": "456; 654"},
        {"a": "312; 456"},
       ]

data = pd.DataFrame(data)
data

Unnamed: 0,a
0,123; 321
1,456; 654
2,312; 456


In [70]:
separator = data["a"].apply(lambda x : x.split(";"))
separator

0    [123,  321]
1    [456,  654]
2    [312,  456]
Name: a, dtype: object

In [77]:
for the_list in separator:
    for x in the_list:
        print(x.strip())

123
321
456
654
312
456


In [57]:
# Flatten the split values into one list of stripped strings first. The original cell
# read get_list before any cell defined it, so it only worked with leftover kernel state.
get_list = [value.strip() for the_list in separator for value in the_list]

# One single-key dict per value, so that pd.DataFrame builds one column per distinct value.
get_list = [{v: 1} for v in get_list]
get_list

[{'123': 1}, {'321': 1}, {'456': 1}, {'654': 1}, {'312': 1}, {'456': 1}]

In [58]:
pd.DataFrame(get_list)

Unnamed: 0,123,312,321,456,654
0,1.0,,,,
1,,,1.0,,
2,,,,1.0,
3,,,,,1.0
4,,1.0,,,
5,,,,1.0,
