In [50]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, precision_recall_fscore_support, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import joblib
import seaborn as sns

In [9]:
data = pd.read_csv("../data/train_data.csv", header = None)

In [10]:
data.head()

Unnamed: 0,0,1
0,yew hedge,1221
1,fire detection shop,19
2,cheap couch roll,398
3,extra watermelon gum,1108
4,used generators for sale uk,213


In [30]:
data_sample = data.sample(frac=0.05)

In [31]:
# Create the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(class_weight='balanced')),
])

In [32]:
# Define the hyperparameters for grid search
parameters = {
    'tfidf__max_features': [500, 1000, 10000],  # Maximum number of features
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigrams or bigrams
    'clf__C': [0.1, 1, 10]  # Regularization parameter
}

In [33]:
# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(pipeline, parameters, scoring='f1_macro', cv=3, n_jobs=-1, verbose = 5)
grid_search.fit(data_sample[0], data_sample[1])

Fitting 3 folds for each of 18 candidates, totalling 54 fits




[CV 1/3] END clf__C=0.1, tfidf__max_features=500, tfidf__ngram_range=(1, 1);, score=0.089 total time=  30.5s
[CV 2/3] END clf__C=0.1, tfidf__max_features=500, tfidf__ngram_range=(1, 2);, score=0.083 total time=  31.5s
[CV 1/3] END clf__C=0.1, tfidf__max_features=1000, tfidf__ngram_range=(1, 2);, score=0.152 total time=  32.1s
[CV 2/3] END clf__C=0.1, tfidf__max_features=10000, tfidf__ngram_range=(1, 1);, score=0.323 total time=  35.9s
[CV 3/3] END clf__C=0.1, tfidf__max_features=10000, tfidf__ngram_range=(1, 2);, score=0.325 total time=  42.6s
[CV 1/3] END clf__C=1, tfidf__max_features=500, tfidf__ngram_range=(1, 2);, score=0.102 total time= 1.4min
[CV 2/3] END clf__C=1, tfidf__max_features=1000, tfidf__ngram_range=(1, 1);, score=0.180 total time= 1.0min
[CV 3/3] END clf__C=1, tfidf__max_features=1000, tfidf__ngram_range=(1, 2);, score=0.173 total time= 1.1min
[CV 1/3] END clf__C=1, tfidf__max_features=10000, tfidf__ngram_range=(1, 2);, score=0.333 total time=  51.3s
[CV 2/3] END clf__



[CV 3/3] END clf__C=0.1, tfidf__max_features=500, tfidf__ngram_range=(1, 1);, score=0.085 total time=  30.9s
[CV 3/3] END clf__C=0.1, tfidf__max_features=500, tfidf__ngram_range=(1, 2);, score=0.082 total time=  31.6s
[CV 3/3] END clf__C=0.1, tfidf__max_features=1000, tfidf__ngram_range=(1, 2);, score=0.153 total time=  32.5s
[CV 1/3] END clf__C=0.1, tfidf__max_features=10000, tfidf__ngram_range=(1, 2);, score=0.324 total time=  43.1s
[CV 2/3] END clf__C=1, tfidf__max_features=500, tfidf__ngram_range=(1, 1);, score=0.104 total time= 1.4min
[CV 3/3] END clf__C=1, tfidf__max_features=500, tfidf__ngram_range=(1, 2);, score=0.100 total time= 1.4min
[CV 1/3] END clf__C=1, tfidf__max_features=1000, tfidf__ngram_range=(1, 2);, score=0.177 total time= 1.1min
[CV 2/3] END clf__C=1, tfidf__max_features=10000, tfidf__ngram_range=(1, 1);, score=0.350 total time=  36.8s
[CV 3/3] END clf__C=1, tfidf__max_features=10000, tfidf__ngram_range=(1, 2);, score=0.336 total time=  50.4s
[CV 1/3] END clf__C=1



[CV 1/3] END clf__C=0.1, tfidf__max_features=500, tfidf__ngram_range=(1, 2);, score=0.085 total time=  31.3s
[CV 2/3] END clf__C=0.1, tfidf__max_features=1000, tfidf__ngram_range=(1, 1);, score=0.158 total time=  31.0s
[CV 2/3] END clf__C=0.1, tfidf__max_features=1000, tfidf__ngram_range=(1, 2);, score=0.151 total time=  32.3s
[CV 3/3] END clf__C=0.1, tfidf__max_features=10000, tfidf__ngram_range=(1, 1);, score=0.326 total time=  35.6s
[CV 1/3] END clf__C=1, tfidf__max_features=500, tfidf__ngram_range=(1, 1);, score=0.104 total time= 1.4min
[CV 2/3] END clf__C=1, tfidf__max_features=500, tfidf__ngram_range=(1, 2);, score=0.100 total time= 1.4min
[CV 3/3] END clf__C=1, tfidf__max_features=1000, tfidf__ngram_range=(1, 1);, score=0.179 total time= 1.1min
[CV 1/3] END clf__C=1, tfidf__max_features=10000, tfidf__ngram_range=(1, 1);, score=0.353 total time=  35.1s
[CV 2/3] END clf__C=1, tfidf__max_features=10000, tfidf__ngram_range=(1, 2);, score=0.335 total time=  51.6s
[CV 3/3] END clf__C=



In [36]:
grid_search.best_params_
best_params = {'clf__C': 1, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 1)}

# train the best model again
best_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=best_params['tfidf__max_features'], ngram_range=best_params['tfidf__ngram_range'])),
    ('clf', LinearSVC(C=best_params['clf__C'], class_weight ='balanced'))
])

In [37]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [38]:
best_pipeline.fit(train[0], train[1])

In [40]:
test['predicted_intents'] = best_pipeline.predict(test[0])

In [43]:
precision_recall_fscore_support(test[1] , test['predicted_intents'] , average='weighted')

(0.6203885524436544, 0.6277262802290611, 0.6200204800988347, None)

In [52]:
len(test)

121365

In [53]:
best_params = {'clf__C': 1, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 1)}

# train the best model again
final_best_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=best_params['tfidf__max_features'], ngram_range=best_params['tfidf__ngram_range'])),
    ('clf', LinearSVC(C=best_params['clf__C'], class_weight ='balanced'))
])

final_best_pipeline.fit(data[0], data[1])

In [58]:
validation_data = pd.read_csv("../data/test_data.csv", header=None)

In [59]:
validation_data.head()

Unnamed: 0,0
0,twister picnic blanket
1,best stop smoking app
2,phosphorus fertiliser
3,tattoo books
4,child's desk chair


In [62]:
validation_data['predicted_intents'] = final_best_pipeline.predict(validation_data[0])

In [63]:
validation_data.to_csv("../output/test_data_inference.csv")

In [65]:
joblib.dump(final_best_pipeline, '../models/TfIdfClassifier.joblib', compress = 1)

['../models/TfIdfClassifier.joblib']