# Tasks
- For 3 models, check how many TfIdf features are sufficient for each n-gram range
    - Save Acc, Prec, Rec and F1
    - Save the result
- For given max features and ngrams run TPOT Classifier
- Write a pipeline which, for at least 10 models, searches for optimal parameters of the following data preprocessors:
    - Bag of Words
    - TfIdf
    - Normal tokens
    - Lemmatized tokens
    - LabelBinarize
    - OneHotEncoding
- Try to improve best of 3 models via hyperparameter tuning and dimensionality reduction
- Use ensemble methods like 
    - stacking classifier
    - voting classifier
- Write and evaluate a simple neural network with an embedding layer using TensorFlow and Keras Tuner
- Use word2vec embeddings in your neural network
- Calculate additional features from NER, POS, Sentiment analysis, KMeans, Topic modelling
- Write a neural network which accepts new inputs

# Setup and Data Import

In [135]:
import ast
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [69]:
# Relative path to the prepared dataset CSV.
path_to_data = os.path.join(
    '..',
    'data',
    'interim',
    'prepared_data_Kamil.csv',
)

In [70]:
drop_columns = ['language', 'lyrics']
unstructured_columns = ['tokens', 'entities', 'pos_tokens', 'tokens_lemma']

# Load the prepared dataset and discard the columns that are not used below.
df = pd.read_csv(path_to_data).drop(columns=drop_columns)

# The list/dict columns come out of the CSV as strings and must be parsed back
# into Python objects. ast.literal_eval is used instead of eval because it only
# accepts Python literals (lists, dicts, tuples, strings, numbers), so text in
# the data file cannot execute arbitrary code.
for column in unstructured_columns:
    df[column] = df[column].apply(ast.literal_eval)

In [71]:
# Sanity check: the parsed `entities` value of the first row.
print(df.loc[0, 'entities'])

{'evening': 'TIME', 'Tryna': 'PERSON', 'Cause': 'ORG'}


In [72]:
# Confirm that the first `entities` value is a real Python object, not a string.
type(df.loc[0, 'entities'])

dict

In [73]:
# Hold out 20% of the rows for testing; stratify on genre and fix the seed.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='genre'),
    df['genre'],
    test_size=0.2,
    stratify=df['genre'],
    random_state=7,
)

# Checking for optimal Data Preprocessing params

In [146]:
# Small development subset: the first 50 training rows, with each row's
# lemmatized tokens joined into one space-separated string.
X_temp = X_train['tokens_lemma'].head(50).map(' '.join)
y_temp = y_train.head(50)

# NOTE: despite its name, `onehot` is a LabelEncoder.
onehot = LabelEncoder()
y_temp = onehot.fit_transform(y_temp)

In [147]:
# TF-IDF settings to compare.
param_grid = {
    'ngram_range': [(1, 1), (1, 2)],
    'max_features': [30, 40],
}

# Candidate classifiers, keyed by the label used for them in the results table.
models = {
    'Random Forest': RandomForestClassifier(max_depth=30),
    'XGBoost Classifier': xgb.XGBClassifier(max_depth=30),
    'Gaussian Naive Bayes': GaussianNB(),
}

# Extra keyword arguments for cross-validation.
cv_kwargs = dict(cv=2, n_jobs=-1)

In [159]:
def find_optimal_tfidf_params(X, y, param_grid, models,
                              cv_kwargs=None, random_state=7):
    """Cross-validate every model on every TF-IDF parameter combination.

    Parameters
    ----------
    X : iterable of str
        Documents passed to ``TfidfVectorizer.fit_transform``.
    y : array-like
        Targets passed to ``cross_validate``.
    param_grid : dict
        Must contain the keys ``'ngram_range'`` and ``'max_features'``, each
        mapping to an iterable of candidate values. Every n-gram range is
        combined with every feature count.
    models : dict
        Maps a display name to the estimator to evaluate.
    cv_kwargs : dict, optional
        Extra keyword arguments forwarded to ``cross_validate``. ``None``
        (the default) forwards nothing.
    random_state : int, default 7
        Seed applied to NumPy's global random generator before the search.

    Returns
    -------
    pandas.DataFrame
        One row per (parameter combination, model) with the columns ``Name``,
        ``max_features``, ``ngram_range``, ``Accuracy``, ``Recall``,
        ``Precision`` and ``F1``. Each score cell holds the list of per-fold
        scores rounded to two decimals. Empty if there is nothing to evaluate.
    """
    # The default is None, and unpacking None with ** raises TypeError.
    cv_kwargs = {} if cv_kwargs is None else cv_kwargs

    np.random.seed(random_state)

    scoring = ['accuracy', 'recall_weighted', 'precision_weighted', 'f1_weighted']

    params_combinations = [{'ngram_range': ngram, 'max_features': max_feature}
                           for ngram in param_grid['ngram_range']
                           for max_feature in param_grid['max_features']]

    # Collect the single-row frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame inside a
    # loop copies all previous rows on every iteration.
    frames = []
    for params in params_combinations:
        # NOTE(review): the vectorizer is fitted on all of X before the CV
        # split, so vocabulary and IDF weights also see the validation folds.
        # Consider fitting it inside each fold (e.g. via a Pipeline).
        tfidf = TfidfVectorizer(**params)
        X_tfidf = tfidf.fit_transform(X).toarray()

        for name, model in models.items():
            cv = cross_validate(model, X_tfidf, y, scoring=scoring, **cv_kwargs)
            round_cv_results(cv)
            frames.append(cv_to_dataframe(cv, name, **params))

    if not frames:
        # Nothing was evaluated: return the empty table with the right columns.
        return _create_results()
    return pd.concat(frames, ignore_index=True)
        
def _create_results():    
    return pd.DataFrame({'Name': [],
                         'max_features': [],
                         'ngram_range': [],
                         'Accuracy': [],
                         'Recall': [],
                         'Precision': [],
                         'F1': []})

def cv_to_dataframe(cv, name, max_features=None, ngram_range=None):
    """Pack one cross-validation result into a single-row dataframe.

    Parameters
    ----------
    cv : dict
        Must provide ``'test_accuracy'``, ``'test_recall_weighted'``,
        ``'test_precision_weighted'`` and ``'test_f1_weighted'``.
    name : str
        Label stored in the ``Name`` column.
    max_features, ngram_range : optional
        TF-IDF settings stored alongside the scores.

    Returns
    -------
    pandas.DataFrame
        Exactly one row; each score cell holds the whole value taken from ``cv``.
    """
    row = {
        'Name': name,
        'max_features': max_features,
        'ngram_range': ngram_range,
        'Accuracy': cv['test_accuracy'],
        'Recall': cv['test_recall_weighted'],
        'Precision': cv['test_precision_weighted'],
        'F1': cv['test_f1_weighted'],
    }
    # Wrap every value in a one-element list so that sequences (per-fold
    # scores, the n-gram tuple) end up in a single cell instead of many rows.
    return pd.DataFrame({column: [value] for column, value in row.items()})

def round_cv_results(cv):
    """Round the four test-score entries of ``cv`` to two decimals, in place.

    Each of ``'test_accuracy'``, ``'test_recall_weighted'``,
    ``'test_precision_weighted'`` and ``'test_f1_weighted'`` is replaced by a
    plain list of rounded values. Other keys are left untouched and nothing
    is returned.
    """
    score_keys = ('test_accuracy', 'test_recall_weighted',
                  'test_precision_weighted', 'test_f1_weighted')
    for score_key in score_keys:
        cv[score_key] = list(map(lambda score: np.round(score, 2), cv[score_key]))

In [160]:
# Run the TF-IDF parameter search on the small development subset.
results = find_optimal_tfidf_params(
    X=X_temp,
    y=y_temp,
    param_grid=param_grid,
    models=models,
    cv_kwargs=cv_kwargs,
)

In [161]:
# Display the collected cross-validation scores (one row per model and TF-IDF setting).
results

Unnamed: 0,Name,max_features,ngram_range,Accuracy,Recall,Precision,F1
0,Random Forest,30.0,"(1, 1)","[0.12, 0.16]","[0.12, 0.16]","[0.1, 0.08]","[0.1, 0.11]"
1,XGBoost Classifier,30.0,"(1, 1)","[0.08, 0.08]","[0.08, 0.08]","[0.07, 0.06]","[0.07, 0.06]"
2,Gaussian Naive Bayes,30.0,"(1, 1)","[0.12, 0.2]","[0.12, 0.2]","[0.07, 0.14]","[0.08, 0.14]"
3,Random Forest,40.0,"(1, 1)","[0.16, 0.12]","[0.16, 0.12]","[0.05, 0.05]","[0.08, 0.07]"
4,XGBoost Classifier,40.0,"(1, 1)","[0.08, 0.16]","[0.08, 0.16]","[0.06, 0.1]","[0.07, 0.12]"
5,Gaussian Naive Bayes,40.0,"(1, 1)","[0.16, 0.28]","[0.16, 0.28]","[0.22, 0.16]","[0.17, 0.17]"
6,Random Forest,30.0,"(1, 2)","[0.12, 0.04]","[0.12, 0.04]","[0.05, 0.03]","[0.07, 0.04]"
7,XGBoost Classifier,30.0,"(1, 2)","[0.16, 0.12]","[0.16, 0.12]","[0.14, 0.12]","[0.14, 0.12]"
8,Gaussian Naive Bayes,30.0,"(1, 2)","[0.08, 0.24]","[0.08, 0.24]","[0.06, 0.14]","[0.06, 0.15]"
9,Random Forest,40.0,"(1, 2)","[0.16, 0.04]","[0.16, 0.04]","[0.1, 0.02]","[0.12, 0.02]"


from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


- logistic regression
- random forest
- xgboost
- SGD classifier
- svc
- knn
- gaussiannb
- adaboost
- MLP

In [64]:
# NOTE(review): `X` is not defined in any cell visible here. Presumably it is a
# vectorized form of X_train created elsewhere; confirm it exists before this
# cell when the notebook is run top to bottom on a fresh kernel.
model = LogisticRegression()
model.fit(X, y_train)
# Scored on the same X / y_train used for fitting, so this is the training
# score, not a held-out estimate.
model.score(X, y_train)

0.29176610978520284

TypeError: score() missing 2 required positional arguments: 'X' and 'y'

In [30]:
# Preview the first rows of the prepared dataframe.
df.head()

Unnamed: 0,artist_name,track_name,popularity,genre,tokens,entities,pos_tokens,tokens_lemma
0,James Bay,Let It Go,73,rock,"[walking, home, talking, loads, seeing, shows,...","{'evening': 'TIME', 'Tryna': 'PERSON', 'Cause'...","[(walking, v), (home, n), (talking, v), (loads...","[walk, home, talk, load, see, show, even, clot..."
1,Bonobo,From You,67,jazz,"[gone, like, changing, seasons, alright, alrig...",{},"[(gone, v), (like, n), (changing, v), (seasons...","[go, like, change, season, alright, alright, s..."
2,Lee Brice,One Of Them Girls,69,country,"[one, girls, peels, bud, light, label, might, ...","{'Kinda': 'PERSON', 'one': 'CARDINAL', 'all ni...","[(one, n), (girls, n), (peels, n), (bud, v), (...","[one, girl, peel, bud, light, label, might, ru..."
3,Andy Gibb,I Just Want To Be Your Everything,62,disco,"[long, finding, long, feeling, feel, strong, g...",{'Build': 'FAC'},"[(long, r), (finding, v), (long, r), (feeling,...","[long, find, long, feel, feel, strong, girl, t..."
4,"Earth, Wind & Fire",You Want My Love,61,jazz,"[got, ta, say, much, tell, love, means, someth...","{'Don': 'PERSON', 'Lies': 'PERSON'}","[(got, v), (ta, n), (say, v), (much, r), (tell...","[get, ta, say, much, tell, love, mean, somethi..."
