In [6]:
import pandas as pd
valid_df = pd.read_csv("valid.csv")
train_df = pd.read_csv("train.csv")
train_df.head()
train_df.info()
train_df['label'].value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21464 entries, 0 to 21463
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    21464 non-null  object
 1   label   21464 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 335.5+ KB


0    11248
1    10216
Name: label, dtype: int64

In [7]:
train_df['text_length'] = train_df['text'].astype(str).apply(len)
train_df['text_length'].describe()

count    21464.000000
mean        62.252656
std         20.913449
min          7.000000
25%         49.000000
50%         62.000000
75%         74.000000
max        926.000000
Name: text_length, dtype: float64

In [8]:
train_df.nlargest(5, 'text_length')[['text_length','text']]

Unnamed: 0,text_length,text
2041,926,hot wheels ranked number one toy for rolling d...
8312,254,"maya angelou, poet, author, civil rights activ..."
12187,238,"'12 years a slave,' 'captain phillips,' 'ameri..."
9210,237,"elmore leonard, modern prose master, noted for..."
17882,228,occasionally you realize someone you thought w...


In [9]:
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# word level TF-IDF
word_tfidf = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1,2),   # defualt values, adjust latter in GridSearch
    min_df=1,
    max_df=0.95,
    stop_words=None
)

# char level TF-IDF (new)
char_tfidf = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3,5),   # char 3-gram ~ 5-gram
    min_df=1
)

# combine
feature_union = FeatureUnion([
    ("word", word_tfidf),
    ("char", char_tfidf),
])

# new baseline word+char TF-IDF â†’ SVM
baseline_char = Pipeline([
    ("vect", feature_union),
    ("clf", LinearSVC())
])

In [10]:
from sklearn.model_selection import GridSearchCV

params_char = {
    "vect__word__ngram_range": [(1,1), (1,2)],
    "vect__word__min_df": [1, 2],
    "clf__C": [1, 3, 5]
    
}

grid_char = GridSearchCV(
    baseline_char,
    param_grid=params_char,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1
)

grid_char.fit(train_df["text"], train_df["label"])
grid_char.best_params_, grid_char.best_score_

({'clf__C': 5, 'vect__word__min_df': 1, 'vect__word__ngram_range': (1, 2)},
 0.8682935138189206)

In [11]:
results_char = pd.DataFrame(grid_char.cv_results_)

results_char = results_char[[
    "param_vect__word__ngram_range",
    "param_vect__word__min_df",
    "param_clf__C",
    "mean_test_score",
    "std_test_score",
    "rank_test_score"
]]

results_char = results_char.sort_values("rank_test_score")

results_char

Unnamed: 0,param_vect__word__ngram_range,param_vect__word__min_df,param_clf__C,mean_test_score,std_test_score,rank_test_score
9,"(1, 2)",1,5,0.868294,0.00271,1
5,"(1, 2)",1,3,0.867925,0.002992,2
1,"(1, 2)",1,1,0.867834,0.002152,3
3,"(1, 2)",2,1,0.863194,0.002028,4
0,"(1, 1)",1,1,0.861087,0.001613,5
7,"(1, 2)",2,3,0.859455,0.003324,6
11,"(1, 2)",2,5,0.858182,0.002136,7
2,"(1, 1)",2,1,0.857313,0.001939,8
4,"(1, 1)",1,3,0.85606,0.001528,9
8,"(1, 1)",1,5,0.854512,0.001121,10


In [12]:
full_df = pd.concat([train_df, valid_df], ignore_index=True)

final_model_char = grid_char.best_estimator_
final_model_char.fit(full_df["text"], full_df["label"])

Pipeline(steps=[('vect',
                 FeatureUnion(transformer_list=[('word',
                                                 TfidfVectorizer(max_df=0.95,
                                                                 ngram_range=(1,
                                                                              2))),
                                                ('char',
                                                 TfidfVectorizer(analyzer='char',
                                                                 ngram_range=(3,
                                                                              5)))])),
                ('clf', LinearSVC(C=5))])

In [14]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

test_df = pd.read_csv("test.csv")

test_pred_char = final_model_char.predict(test_df["text"])

print(classification_report(test_df["label"], test_pred_char))
print(confusion_matrix(test_df["label"], test_pred_char))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89       526
           1       0.88      0.85      0.87       440

    accuracy                           0.88       966
   macro avg       0.88      0.88      0.88       966
weighted avg       0.88      0.88      0.88       966

[[474  52]
 [ 64 376]]
