# Data cleaning and preprocessing


## Libraries


In [1]:
# Installation

#! pip install pyLDAvis
!pip install --upgrade gensim
!pip install sklearn
!pip install numpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn
  Downloading sklearn-0.0.post5.tar.gz (3.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0.post5-py3-none-any.whl size=2950 sha256=64683d37b768ad63e24592e95539dee4efffd92aa74b97318df8746a6a026c5a
  Stored in directory: /root/.cache/pip/wheels/38/1f/8d/4f812c590e074c1e928f5cec67bf5053b71f38e2648739403a
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post5
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Import libraries

import re                               # Regular Expressions
import time                             # Time Function Execution
import datetime as dt                   # Datetime utilities
import pandas   as pd                   # Dataframe Manipulation
import numpy    as np                   # Matrix Manipulation
import seaborn  as sns                  # Visualization and Plotting
import matplotlib.pyplot as plt         # Plotting

In [3]:
from sklearn import preprocessing       # Normalization and Data Transformations
from sklearn import feature_selection   # Feature Selection
from sklearn import feature_extraction  # Feature Extraction
from sklearn import model_selection     # Train-testing Split
from sklearn import utils               # Verification and Safe operations
from collections import Counter         # Frequency of words

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics  import accuracy_score

In [6]:
import nltk                                          # Natural Language Toolkit
import gensim                                        # Word2Vec
#import pyLDAvis.gensim
import gensim.downloader as api
from nltk.tokenize import word_tokenize
from gensim.models import doc2vec
from gensim        import corpora
nltk.download('stopwords')                           # Query stopwords list
nltk.download('wordnet')                             # Query wordnet for POS
nltk.download('averaged_perceptron_tagger')          # Used in Lemmatization
nltk.download('punkt')                               # Tokenizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
from multiprocessing import cpu_count

In [8]:
from pydrive.auth        import GoogleAuth           # Use Google Auth
from pydrive.drive       import GoogleDrive          # Mount Google's drive
from google.colab        import auth                 # Use Google Auth in Colab
from oauth2client.client import GoogleCredentials    # Api to load google credentials

### Libraries Configuration  
Adjust settings from differents libraries

In [9]:
sns.set_theme()

## Load Data  
First load the csv into /content/ directory.  
Run the code above and agree to allow colab to get access to the folder /content/  


In [10]:
# Import libraries to mount google drive home folder
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Change the string in var filepath to set where the path resides.

In [12]:
filepath = "/content/drive/MyDrive/EQUIPO NLP/RETO/datos/aspcliamsdb_r_sptkpi.xlsx"

Load Data:
- Set corresponding data type for each column.
- Use first row in csv as column's name.
- Leave rows with no data.

In [13]:
# Load Dataset
# Set types to each column
dtypes     = {"cm_id":'category', "claim_id":'category',"pl":'category',"productmodel":'string',"partno":'string',"totalamount":'float32',"program_number":'category',"program_name":'string',"descr":'string',"actual_activity":'string',"reseller":'string',"claim_status":'category',"business_type":'category',"activity_subtype":'category',"activity_subtype_id":'category'}
# Load from excel
raw_df     = pd.read_excel(filepath, header=0, verbose=True, dtype=dtypes, thousands=",") 

Reading sheet 0


  warn(msg)
  raw_df     = pd.read_excel(filepath, header=0, verbose=True, dtype=dtypes, thousands=",")
  raw_df     = pd.read_excel(filepath, header=0, verbose=True, dtype=dtypes, thousands=",")


### Set column type

In [14]:
raw_df["sdate"] = pd.to_datetime(raw_df["start_date"], errors="coerce", utc=True)   # Convert to Time Series. Return Date type
raw_df["edate"] = pd.to_datetime(raw_df["end_date"], errors="coerce", utc=True)     # Convert to Time Series. Return Date type

## Data Description

In [15]:
# Sort Columns by date
raw_df = raw_df.sort_values(by="sdate")
# Delete start_date and end_date columns
raw_df = raw_df.drop(columns=['start_date','end_date'])

In [16]:
raw_df.shape

(24608, 17)

## Cleaning

In [17]:
## Support Functions
def cleantext(text):
    text = str(text)
  # Clean: Convert to lowercase. Remove punctuation and characters, then strip.
    text_ph0 = text.lower()                                      # Lowercase text
    text_ph1 = re.sub('https?://\S+|www\.\S+|\.com', '', text_ph0)     # Remove URLs
    text_ph2 = re.sub('<.*>+', '', text_ph1)                     # Remove content inside brackets
    text_ph3 = re.sub('\n', '', text_ph2)                        # Remove EOL character
    text_ph4 = re.sub(r'[^\w\s]', ' ', str(text_ph3).strip())    # Mod: Add replace with whitespace to handle End.Begin phrase
    text_clean = text_ph4
    return text_clean


def remove_stopwords(stopwords, lst_text):
  return [word for word in lst_text if word not in stopwords]


def gen_stopwords(tokens,gamma):
    # Calculate word frequencies
    word_freq = Counter(tokens)       # Count frequency of words
    threshold = len(word_freq)//gamma # Set a fraction of dataset as filter lim
    stopwords = {word for word, count in word_freq.items() if count >= threshold}
    return stopwords, threshold


def gen_tokenize(dataset):
    # Tokenize and normalize the phrases
    tokens = []
    list_text = dataset              # Dataset to list
    for phrase in list_text:         # Select a phrase
        words = word_tokenize(phrase.lower()) # Lower phrase and tokenize
        tokens.extend(words)
    return tokens    


def stemtext(lst_text):
  ss = nltk.stem.snowball.SnowballStemmer("english", ignore_stopwords=True)     # Improve version of PorterStemming
  return [ss.stem(word) for word in lst_text]


def lemtext(lst_text):
  from nltk.stem   import WordNetLemmatizer
  lem          = WordNetLemmatizer()                           # Instance Lemmatizer
  tagged_words = nltk.pos_tag(lst_text)                        # Tuple with POS
  return [lem.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in tagged_words]


def get_wordnet_pos(treebank_tag):
  from nltk.corpus import wordnet
  if treebank_tag.startswith('J'):
    return wordnet.ADJ
  elif treebank_tag.startswith('V'):
    return wordnet.VERB
  elif treebank_tag.startswith('N'):
    return wordnet.NOUN
  elif treebank_tag.startswith('R'):
    return wordnet.ADV
  else:
    return wordnet.NOUN    # If POS is not recognized; Noun assumption.


In [18]:
# Generate DataFrame with columns name, description and target. Use name and description to predict
df = pd.DataFrame({'Name':raw_df['program_name'],'Descr':raw_df['descr'],'target':raw_df['activity_subtype']})

In [19]:
df.head(10)

Unnamed: 0,Name,Descr,target
20494,ASP Meijer BTS '21 Endcap,MARKET DEVELOPMENT FUNDS,BTS
23268,Costco.com C2'22 NB IR Program,undefined,Display
23440,Costco.com C2'22 NB IR Program,undefined,Display
13686,Target OJ5740 Markdown,EOL Program,UNKNOWN
17139,HP Amp Price Elasticity Test,SPECIAL NEGOTIATED DISCOUNT,UNKNOWN
7644,Q118 HPS Toner Private Offer Rebate,SPECIAL NEGOTIATED DISCOUNT,Program
1823,Q118 OPS Ink Private offer Rebate,SPECIAL NEGOTIATED DISCOUNT,Program
13057,DeskJet 3752 BTB'18 Price Protection,SPECIAL NEGOTIATED DISCOUNT,BTB
20606,Envy 4512 BTB'18 Price Protection,SPECIAL NEGOTIATED DISCOUNT,BTB
6096,Q118 OPSToner Private Offer Rebate,SPECIAL NEGOTIATED DISCOUNT,Program


Without stemming

In [20]:
# Fill undefined values in Descr with Name
replace_undefined                        = df['Name'][df['Descr'] == 'undefined']    # Search in Description the value undefined
df['Descr'].loc[replace_undefined.index] = replace_undefined                         # Replace undefined in Descr with Name of same row
df                                       = df.dropna()                               # Update DF: Remove Nan

# Clean: Convert to lowercase. Remove punctuation and characters, then strip
name_clean  = list(map(cleantext,df['Name']))                                        # Lowercase and remove symbols for every sentence in Name column
descr_clean = list(map(cleantext,df['Descr']))                                       # Lowercase and remove symbols for every sentence in Descr column

# Tokenize phrases
token_name  = [phrase.split() for phrase in name_clean]
token_desc  = [phrase.split() for phrase in descr_clean]

# Load stopwords
nltk_stopwords = set(nltk.corpus.stopwords.words("english"))
# Generate custom stopwords
stopwords, threshold = gen_stopwords(gen_tokenize(name_clean),4)                     # Generate stopwords for Name
stopwords.update(nltk_stopwords)                                                     # Add default stopwords
stopwords.update("program")
# Generate list to exclude from stopwords
exclude_stopwords = set(map(cleantext,raw_df["activity_subtype_id"].unique()))       # Initialice include subtypes
exclude_stopwords.remove('nan')                                                      # Remove Nan
exclude_stopwords.remove('other')                                                    # Remove Other
exclude_stopwords.remove('unknown')                                                  # Remove Unknown

# Generate a custom exclution list
exclution_list = {'asp','bby','printer','bts','best','buy','printers','amazon','btb','inkjet','ad','costco','walmart','ink','hyperx','toner'}
exclude_stopwords.update(exclution_list)                                             # Union of custom and default exclution set
stopwords_clean = list(stopwords - exclude_stopwords)                                # Remove exclution words from stopwords

# Remove stopwords
name = [[word for word in phrase if word not in stopwords_clean] for phrase in token_name]
desc = [[word for word in phrase if word not in stopwords_clean] for phrase in token_desc] 

# Create clean dataset
df_clean = pd.DataFrame({'Name':name,'Descr':desc,'Target':df['target']})            # Create DF: New DF with clean Data

# Show stopwords
print("Stopwords (Threshold: "+str(threshold)+") :", stopwords_clean)

Stopwords (Threshold: 394) : ['no', 'notebook', 'nov', 'funds', 'they', 'doesn', 'mightn', "shouldn't", 'their', "that'll", 'all', 'those', '6', 'this', "you've", 'under', 'it', 'new', 'from', 'but', 'don', 'o', 'doing', "it's", 'where', "mightn't", 'np', 'll', 'the', 'me', 'aren', 'isn', 'nor', 'himself', 'q2', 'that', 'by', 'yourself', 'between', 'there', 'recycle', "hadn't", 'mustn', 'having', "wasn't", 'you', "aren't", '2023', 'funding', 'h', 'in', 'while', 'same', "you'll", 'is', 'them', 'his', 'c2', 'through', 'yours', 'via', 'hers', '01', 'too', 'before', '10', 'so', "wouldn't", '31', 'here', 'own', 'h1', 'trade', 'because', 'just', 'shan', 'had', 'haven', 'now', '23', 'do', 're', 'of', '2019', 'against', 'fy', 'her', 'c1', '2', '3', 'ourselves', 'fq2', 'p', "should've", 'h2', 'with', '03', 'we', 'as', '5', 'm', 'q1', 'him', 'didn', 'and', 'have', 'y', 'did', 'needn', 's', 'promotion', '2021', '22', 'be', 'does', 'myself', 'hp', 'wouldn', "needn't", 'program', 'retail', 'itself'

In [21]:
df_clean    = df_clean.reset_index()
empty_descr = [i for i,x in enumerate(df_clean['Descr']) if not x]
empty_name  = [i for i,x in enumerate(df_clean['Name']) if not x]
df_clean['Descr'].loc[empty_descr] = df_clean['Name'].loc[empty_descr]                         # Replace undefined in Descr with Name of same ro
df_clean['Name'].loc[empty_name]   = df_clean['Descr'].loc[empty_name]
new_empty = [i for i,x in enumerate(df_clean['Name']) if not x]
df_clean = df_clean.drop(index=new_empty,axis=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Descr'].loc[empty_descr] = df_clean['Name'].loc[empty_descr]                         # Replace undefined in Descr with Name of same ro
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Name'].loc[empty_name]   = df_clean['Descr'].loc[empty_name]


In [22]:
df_clean = df_clean.set_index('index')

In [23]:
df_clean.head(5)

Unnamed: 0_level_0,Name,Descr,Target
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20494,"[asp, meijer, bts, endcap]","[market, development]",BTS
23268,[costco],[costco],Display
23440,[costco],[costco],Display
13686,"[target, oj5740, markdown]","[target, oj5740, markdown]",UNKNOWN
17139,"[amp, price, elasticity, test]","[special, negotiated, discount]",UNKNOWN


In [None]:
#df_clean.to_csv("/content/drive/MyDrive/EQUIPO NLP/RETO/datos/df_clean.csv")

with stemming

In [24]:
# Apply stemming
name_stem   = [lemtext(sentence) for sentence in token_name]
desc_stem   = [lemtext(sentence) for sentence in token_desc]

# Load stopwords
nltk_stopwords = set(nltk.corpus.stopwords.words("english"))
# Generate custom stopwords
stopwords, threshold = gen_stopwords(gen_tokenize(name_clean),4)                      # Generate stopwords for Name
stopwords.update(nltk_stopwords)                                                     # Add default stopwordsi
stopwords.update("program")
# Generate list to exclude from stopwords
exclude_stopwords = set(map(cleantext,raw_df["activity_subtype_id"].unique()))       # Initialice include subtypes
exclude_stopwords.remove('nan')                                                      # Remove Nan
exclude_stopwords.remove('other')                                                    # Remove Other
exclude_stopwords.remove('unknown')                                                  # Remove Unknown

# Generate a custom exclution list
exclution_list = {'asp','bby','printer','bts','best','buy','printers','amazon','btb','inkjet','ad','costco','walmart','ink','hyperx','toner'}
exclude_stopwords.update(exclution_list)                                             # Union of custom and default exclution set
stopwords_clean = list(stopwords - exclude_stopwords)                                # Remove exclution words from stopwords

# Remove stopwords
name = [[word for word in phrase if word not in stopwords_clean] for phrase in name_stem]
desc = [[word for word in phrase if word not in stopwords_clean] for phrase in desc_stem] 

# Create clean dataset
df_stem = pd.DataFrame({'Name':name,'Descr':desc,'Target':df['target']})             # Create DF: New DF with clean Data
# Show stopwords
print("Stopwords (Threshold: "+str(threshold)+") :", stopwords_clean)

Stopwords (Threshold: 394) : ['no', 'notebook', 'nov', 'funds', 'they', 'doesn', 'mightn', "shouldn't", 'their', "that'll", 'all', 'those', '6', 'this', "you've", 'under', 'it', 'new', 'from', 'but', 'don', 'o', 'doing', "it's", 'where', "mightn't", 'np', 'll', 'the', 'me', 'aren', 'isn', 'nor', 'himself', 'q2', 'that', 'by', 'yourself', 'between', 'there', 'recycle', "hadn't", 'mustn', 'having', "wasn't", 'you', "aren't", '2023', 'funding', 'h', 'in', 'while', 'same', "you'll", 'is', 'them', 'his', 'c2', 'through', 'yours', 'via', 'hers', '01', 'too', 'before', '10', 'so', "wouldn't", '31', 'here', 'own', 'h1', 'trade', 'because', 'just', 'shan', 'had', 'haven', 'now', '23', 'do', 're', 'of', '2019', 'against', 'fy', 'her', 'c1', '2', '3', 'ourselves', 'fq2', 'p', "should've", 'h2', 'with', '03', 'we', 'as', '5', 'm', 'q1', 'him', 'didn', 'and', 'have', 'y', 'did', 'needn', 's', 'promotion', '2021', '22', 'be', 'does', 'myself', 'hp', 'wouldn', "needn't", 'program', 'retail', 'itself'

In [25]:
df_stem = df_stem.reset_index()
empty_descr = [i for i,x in enumerate(df_stem['Descr']) if not x]
empty_name  = [i for i,x in enumerate(df_stem['Name']) if not x]
df_stem['Descr'].loc[empty_descr] = df_stem['Name'].loc[empty_descr]                         # Replace undefined in Descr with Name of same ro
df_stem['Name'].loc[empty_name]   = df_stem['Descr'].loc[empty_name]
new_empty = [i for i,x in enumerate(df_stem['Name']) if not x]
df_stem = df_stem.drop(index=new_empty,axis=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stem['Descr'].loc[empty_descr] = df_stem['Name'].loc[empty_descr]                         # Replace undefined in Descr with Name of same ro
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stem['Name'].loc[empty_name]   = df_stem['Descr'].loc[empty_name]


In [26]:
df_stem = df_stem.set_index('index')

In [27]:
df_stem.head(15)

Unnamed: 0_level_0,Name,Descr,Target
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20494,"[asp, meijer, bts, endcap]","[market, development, fund]",BTS
23268,[costco],[costco],Display
23440,[costco],[costco],Display
13686,"[target, oj5740, markdown]","[target, oj5740, markdown]",UNKNOWN
17139,"[amp, price, elasticity, test]","[special, negotiate, discount]",UNKNOWN
7644,"[q118, toner, private, offer, rebate]","[special, negotiate, discount]",Program
1823,"[q118, ink, private, offer, rebate]","[special, negotiate, discount]",Program
13057,"[deskjet, 3752, btb, 18, price, protection]","[special, negotiate, discount]",BTB
20606,"[envy, 4512, btb, 18, price, protection]","[special, negotiate, discount]",BTB
6096,"[q118, opstoner, private, offer, rebate]","[special, negotiate, discount]",Program


In [28]:
df_stem.loc[4632,'Name']

['q218', 'ink', 'private', 'offer', 'rebate']

Aclaración: se está usando lematización aunque los nombres de el dataset diga stemming gg.

In [None]:
# limpieza removido rows
24608 - 21091

In [None]:
#df_stem.to_csv("/content/drive/MyDrive/EQUIPO NLP/RETO/datos/df_lem.csv")

In [29]:
# Change from reggex to stem
data = df_clean

# Classification


In [30]:
# Split dataset into trainning-testing and validation
train_test_df  = data[['Target','Name','Descr']].query("Target != 'UNKNOWN'")
validation_df  = data[['Target','Name','Descr']].query("Target == 'UNKNOWN'")

In [31]:
train_test_df.to_csv("/content/drive/MyDrive/EQUIPO NLP/RETO/datos/df_nishi_clean.csv")
#validation_df.to_csv("/content/drive/MyDrive/EQUIPO NLP/RETO/datos/df_lem_Unknown.csv")

In [None]:
# mapping and dropping nan values

# Create a bijection from program category to ordinals
orden = {
"Program":        int(0),
"Display":        int(1),
"BTB":            int(2),
"Search":         int(3),
"Holiday":        int(4),
"BTS":            int(5),
"Email":          int(6),
"Digital":        int(7),
"Trad_media":     int(8)
}

train_test_df['Target'] = train_test_df['Target'].map(orden)    # Apply the bijection

  and should_run_async(code)


In [None]:
# Train and test split
X_train, X_test, y_train, y_test = model_selection.train_test_split (train_test_df['Name'], train_test_df[['Target']] , test_size=0.3)

  and should_run_async(code)


## Segmentation

In [None]:
program_df  = train_test_df[['Target','Name','Descr']].query("Target == 0")

  and should_run_async(code)


In [None]:
program_df_X = pd.DataFrame({'Corpus': program_df['Name'] + program_df['Descr']})

  and should_run_async(code)


In [None]:
program_df_X

  and should_run_async(code)


Unnamed: 0_level_0,Corpus
index,Unnamed: 1_level_1
7644,"[q118, toner, private, offer, rebate, special,..."
1823,"[q118, ink, private, offer, rebate, special, n..."
6096,"[q118, opstoner, private, offer, rebate, speci..."
22282,"[q118, ink, private, offer, rebate, special, n..."
12918,"[q118, ink, private, offer, rebate, special, n..."
...,...
14809,"[ad, fund, inkjet, printer, market, developmen..."
20225,"[amzn, coupon, best, deal, amzn, coupon, best,..."
20321,"[asp, essendant, dj, 3755, sale, goal, attainm..."
21736,"[ad, fund, inkjet, printer, market, developmen..."


In [None]:
text_data = program_df_X["Corpus"]
dictionary = corpora.Dictionary(text_data)
corpus     = [dictionary.doc2bow(text) for text in text_data]

  and should_run_async(code)


In [None]:
NUM_topics = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus,num_topics=NUM_topics,id2word=dictionary,passes=40)
ldamodel.save('/content/drive/MyDrive/EQUIPO NLP/RETO/datos/program_15.gensim')

  and should_run_async(code)


In [None]:
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.420*"chromebook" + 0.192*"national" + 0.105*"cq4" + 0.088*"e" + 0.061*"target" + 0.044*"contribution" + 0.011*"sam" + 0.008*"supply" + 0.005*"network" + 0.005*"club"')
(1, '0.393*"buy" + 0.363*"best" + 0.134*"q4" + 0.026*"t5" + 0.014*"series" + 0.014*"400" + 0.008*"bj" + 0.007*"bbm" + 0.006*"bjs" + 0.001*"cost"')
(2, '0.142*"asp" + 0.114*"billboard" + 0.110*"store" + 0.096*"printer" + 0.092*"fund" + 0.086*"development" + 0.085*"market" + 0.060*"2h" + 0.036*"brand" + 0.017*"express"')
(3, '0.190*"instant" + 0.184*"rebate" + 0.128*"laserjet" + 0.098*"offer" + 0.093*"consumer" + 0.091*"channel" + 0.091*"promotional" + 0.091*"scanner" + 0.010*"day" + 0.005*"free"')
(4, '0.247*"goal" + 0.245*"sale" + 0.241*"attainment" + 0.155*"asp" + 0.042*"april" + 0.038*"june" + 0.003*"furniture" + 0.003*"mart" + 0.003*"nebraska" + 0.003*"fyq4"')
(5, '0.620*"tank" + 0.142*"ce" + 0.132*"asp" + 0.044*"smart" + 0.020*"tin" + 0.007*"marine" + 0.007*"conn" + 0.005*"bronze" + 0.000*"silver" + 0.000*"jul

  and should_run_async(code)


In [None]:
lda_display = pyLDAvis.gensim.prepare(ldamodel,corpus,dictionary,sort_topics=False)
pyLDAvis.display(lda_display)

  and should_run_async(code)


# Representation

In [None]:
#model_params

#word2vec
vector_size = 100
window = 3
min_count = 2
use_sg = 0

#random forest
max_depth = 4

  and should_run_async(code)


In [None]:
#word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=vector_size,
                                   window=window,
                                   min_count=min_count,
                                   sg=use_sg)

  and should_run_async(code)


find new categories for program category

# Data classification


In [None]:
# data set adjustments for classifier

words = set(w2v_model.wv.index_to_key )
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test])

# averaging words into sentence

X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(vector_size, dtype=float))
        
X_test_vect_avg = []

for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(vector_size, dtype=float))

  and should_run_async(code)


ValueError: ignored

In [None]:
np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train]).shape

  and should_run_async(code)


ValueError: ignored

In [None]:
# random forest classification
clf = RandomForestClassifier(max_depth=max_depth, random_state=0)
clf.fit(X_train_vect_avg, np.ravel(y_train))
y_pred = clf.predict(X_test_vect_avg)
accuracy_score(y_test, y_pred)

0.710267229254571