In [9]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re

In [4]:
def read_data():
    train = pd.read_csv('../data/train_E6oV3lV.csv')
    test = pd.read_csv('../data/test_tweets_anuFYb8.csv')
    # train = train.head(10)
    # test = test.head(10)
    print("train size:", train.shape)
    print('test size:', test.shape)
    return train, test


train, test = read_data()

train size: (31962, 3)
test size: (17197, 2)


In [5]:
train.label.value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [6]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [10]:
def preprocess(text):
    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower().strip()
    return text

In [13]:
x=preprocess(train.tweet[0])
x

'user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction run'

In [15]:
train['tweet'] = train['tweet'].apply(lambda x: preprocess(x))

In [16]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,user when a father is dysfunctional and is so ...
1,2,0,user user thanks for lyft credit i can't use c...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in ur
4,5,0,factsguide society now motivation


In [14]:
# def preprocess_data(vec):
#     vec = vec.replace('[', '').replace(']', '')
#     vec = vec.split(', ')
#     vec = [float(j) for j in vec]
#     return vec

In [41]:
from nltk.corpus import stopwords
import nltk
from string import punctuation
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import wordpunct_tokenize

In [67]:
# sentences=[[word for word in sent if word not in spe] for sent in sentences]
# sentences[0]

In [65]:
def preprocess2(text):
    spe = stopwords.words('english')+list(punctuation)
    text=' '.join([word for word in text.split() if word not in spe])
    return text


In [68]:
train['tweet'] = train['tweet'].apply(lambda x: preprocess2(x))
train.head()

Unnamed: 0,id,label,tweet
0,1,0,user father dysfunctional selfish drags kids d...
1,2,0,user user thanks lyft credit can't use cause o...
2,3,0,bihday majesty
3,4,0,model love u take u time ur
4,5,0,factsguide society motivation


In [69]:
test['tweet'] = test['tweet'].apply(lambda x: preprocess(x))
test['tweet'] = test['tweet'].apply(lambda x: preprocess2(x))
test.head()

Unnamed: 0,id,tweet
0,31963,studiolife aislife requires passion dedication...
1,31964,user white supremacists want everyone see new ...
2,31965,safe ways heal acne altwaystoheal healthy healing
3,31966,hp cursed child book reservations already yes ...
4,31967,rd bihday amazing hilarious nephew eli ahmir u...


In [84]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


count_vect = CountVectorizer()
tfidf_transformer=TfidfTransformer()

x_train = count_vect.fit_transform(train['tweet'].tolist())  
x_test = count_vect.transform(test['tweet'].tolist())


x_train = tfidf_transformer.fit_transform(x_train)
x_test = tfidf_transformer.fit_transform(x_test)

y_train = train['label']

In [85]:
print("x_train:",x_train.shape)
print("x_test:",x_test.shape)
print("y_train:",y_train.shape)

x_train: (31962, 37451)
x_test: (17197, 37451)
y_train: (31962,)


In [86]:
print(x_train[2])

  (0, 19934)	0.8811601339763669
  (0, 3336)	0.47281795470429333


In [88]:
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
df = pd.DataFrame()
df['id'] = test['id']
df['label'] = y_pred
df.to_csv('../data/submission1.csv', index=False)
df.head()

Unnamed: 0,id,label
0,31963,0
1,31964,0
2,31965,0
3,31966,0
4,31967,0


In [89]:
df.label.value_counts()

0    16576
1      621
Name: label, dtype: int64

In [4]:
def create_train_val_data(train, test):
    train['tweet'] = train['tweet'].apply(lambda a: preprocess_data(a))
    test['tweet'] = test['tweet'].apply(lambda a: preprocess_data(a))

    X = train['tweet']
    Y = train['label']
    X = [np.array(i) for i in X]
    X = np.array(X)
    Y = np.array(list(Y))
    X_test = test['tweet']
    X_test = [np.array(i) for i in X_test]
    X_test = np.array(X_test)

    X_tr, X_val, Y_tr, Y_val = train_test_split(X, Y, test_size=0.33, random_state=42)

    print(X_tr.shape)
    print(Y_tr.shape)
    print(X_val.shape)
    print(Y_val.shape)
    print(X_test.shape)

    return X, Y, X_tr, Y_tr, X_val, Y_val, X_test


X, Y, X_tr, Y_tr, X_val, Y_val, X_test = create_train_val_data(train, test)

(21414, 768)
(21414,)
(10548, 768)
(10548,)
(17197, 768)


In [6]:
# create models
models = {'knn': KNeighborsClassifier(), 'logit': LogisticRegression(), 'svm': SVC()}

for model, call in models.items():
    print(f'Training {model}')
    call.fit(X_tr, Y_tr)
    pred = call.predict(X_val)
    print(accuracy_score(pred, Y_val))
    print(confusion_matrix(pred,Y_val))
    print(classification_report(pred,Y_val))
    print()


Training knn
0.9484262419416003
[[9677  415]
 [ 129  327]]
              precision    recall  f1-score   support

           0       0.99      0.96      0.97     10092
           1       0.44      0.72      0.55       456

    accuracy                           0.95     10548
   macro avg       0.71      0.84      0.76     10548
weighted avg       0.96      0.95      0.95     10548


Training logit


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9488054607508533
[[9635  369]
 [ 171  373]]
              precision    recall  f1-score   support

           0       0.98      0.96      0.97     10004
           1       0.50      0.69      0.58       544

    accuracy                           0.95     10548
   macro avg       0.74      0.82      0.78     10548
weighted avg       0.96      0.95      0.95     10548


Training svm
0.9510807736063709
[[9750  460]
 [  56  282]]
              precision    recall  f1-score   support

           0       0.99      0.95      0.97     10210
           1       0.38      0.83      0.52       338

    accuracy                           0.95     10548
   macro avg       0.69      0.89      0.75     10548
weighted avg       0.97      0.95      0.96     10548




In [6]:
model = SVC()
model.fit(X, Y)
y_pred = model.predict(X_test)
df = pd.DataFrame()
df['id'] = test['id']
df['label'] = y_pred
df.to_csv('../data/submission.csv', index=False)

In [7]:
get_embeddings([x])


[[0.11017496138811111,
  0.05034864693880081,
  -0.25847572088241577,
  -0.929716169834137,
  0.41023534536361694,
  -0.17932453751564026,
  -0.01990448124706745,
  0.9529711008071899,
  0.22372391819953918,
  -0.09859032928943634,
  0.423568993806839,
  -0.19547376036643982,
  0.032352086156606674,
  0.45575568079948425,
  -0.34351199865341187,
  0.2789958715438843,
  0.17035414278507233,
  0.5755972266197205,
  -0.1527310013771057,
  0.4060811400413513,
  0.0371258482336998,
  0.036854859441518784,
  -0.09053368866443634,
  0.29525068402290344,
  0.033942773938179016,
  0.6658225655555725,
  0.015101602301001549,
  0.5173330903053284,
  -0.2533319592475891,
  0.13837438821792603,
  0.006637370213866234,
  0.08168821781873703,
  0.34122568368911743,
  0.20935456454753876,
  0.312744677066803,
  -1.058285117149353,
  -0.4379790723323822,
  0.486817330121994,
  -0.15480440855026245,
  0.038146521896123886,
  -0.3987184762954712,
  -0.5787211656570435,
  -0.021512895822525024,
  -0.86808

In [None]:
test=pd.read_csv('../data/test_tweets_anuFYb8.csv')
print(test.shape)
test.head()

In [None]:
# x_train=get_embeddings()
tweets=train['tweet'].to_list()
tweets=[preprocess(i) for i in tweets]
x_train=get_embeddings(tweets)