In [1]:
import json
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from math import log
import time
import warnings

In [2]:
VOCAB_SIZE = 5000

# Exploritory Data analysis 

In [3]:
d1 = pd.read_json(path_or_buf="./data/domain1_train.json/domain1_train.json", lines=True)

In [4]:
d1.iloc[0]

text     [70, 746, 825, 109, 2083, 0, 2, 0, 0, 0, 9, 0,...
label                                                    1
Name: 0, dtype: object

In [5]:
d1['text'][0]

[70, 746, 825, 109, 2083, 0, 2, 0, 0, 0, 9, 0, 1004, 19, 0, 0, 7, 913]

In [6]:
d1['label'][0]

1

In [7]:
d1.head()

Unnamed: 0,text,label
0,"[70, 746, 825, 109, 2083, 0, 2, 0, 0, 0, 9, 0,...",1
1,"[1209, 179, 1952, 4, 4959, 7, 0, 2, 978, 1522,...",1
2,"[287, 3, 3330, 0, 23, 12, 13, 465, 74, 8, 0, 8...",1
3,"[0, 0, 3, 592, 19, 2, 706, 1439, 2575, 7, 2, 0...",1
4,"[9, 2, 110, 12, 42, 32, 44, 361, 9, 3860, 2358...",1


In [8]:
texts_all_d1 = d1["text"]

In [9]:
counter_all_d1 = Counter()

In [10]:
for text in texts_all_d1:
    for word in text:
        counter_all_d1[word] +=1

In [11]:
len(counter_all_d1)

4926

In [12]:
texts_human_d1 = d1[d1["label"] == 1]["text"]
counter_human_d1 = Counter()
for text in texts_human_d1:
    for word in text:
        counter_human_d1[word] +=1
len(counter_human_d1)

4836

In [13]:
d1["text"].apply(len).max()

238

In [None]:
texts_human_d1.apply(len).mean()

In [None]:
texts_ai_d1 = d1[d1["label"] == 0]["text"]
counter_ai_d1 = Counter()
for text in texts_ai_d1:
    for word in text:
        counter_ai_d1[word] +=1
len(counter_ai_d1)

In [None]:
texts_ai_d1.apply(len).mean()

In [15]:
d2 = pd.read_json(path_or_buf="./data/domain2_train.json/domain2_train.json", lines=True)

In [None]:
texts_all_d2 = d2["text"]
counter_all_d2 = Counter()
for text in texts_all_d2:
    for word in text:
        counter_all_d2[word] +=1
len(counter_all_d2)

In [None]:
texts_human_d2 = d2[d2["label"] == 1]["text"]
counter_human_d2 = Counter()
for text in texts_human_d2:
    for word in text:
        counter_human_d2[word] +=1
len(counter_human_d2)

In [None]:
len(texts_human_d2)

In [None]:
texts_human_d2.apply(len).mean()

In [None]:
texts_human_d2.apply(len).median()

In [None]:
texts_ai_d2 = d2[d2["label"] == 0]["text"]
counter_ai_d2 = Counter()
for text in texts_ai_d2:
    for word in text:
        counter_ai_d2[word] +=1
len(counter_ai_d2)

In [None]:
texts_ai_d2.apply(len).mean()

In [None]:
len(texts_ai_d2)

In [None]:
zip(*counter_all_d1.most_common())

In [None]:
counter_all_d1.most_common()

In [None]:
testList2_human = [log(elem2) for elem1, elem2 in counter_human_d1.most_common()]
testList2_ai = [log(elem2) for elem1, elem2 in counter_ai_d1.most_common()]

In [None]:
testList2_human

In [None]:
plt.plot(testList2_human)

plt.plot(testList2_ai)

plt.show()

In [None]:
testList_human = [log(elem2*6) for elem1, elem2 in counter_human_d2.most_common()]
testList_ai = [log(elem2) for elem1, elem2 in counter_ai_d2.most_common()]
plt.plot(testList_human)

plt.plot(testList_ai)

plt.show()

In [None]:
len(texts_ai_d2)/len(texts_human_d2)

# Feature Engineering/Selection

In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN

In [None]:
def get_BOW(row):
    text = row['text']
    bow = [0 for i in range(VOCAB_SIZE)]
    for idx in text:
        bow[idx]+=1
    return np.array(bow)

In [17]:
# Define the vocabulary explicitly as a list of integers from 0 to 4999
vocabulary = [str(i) for i in range(VOCAB_SIZE)]

# Initialize the CountVectorizer with the predefined vocabulary
vectorizer = CountVectorizer(vocabulary=vocabulary)

# Fit and transform the text data to obtain a feature matrix
bow_matrix_1 = vectorizer.fit_transform(d1["text"].apply(lambda x: " ".join(map(str, x))))

# Convert the feature matrix to a dense NumPy array if needed
#dense_bow_matrix = bow_matrix.toarray()


In [None]:
X_train, X_test, y_train, y_test = train_test_split(bow_matrix_1, d1['label'], test_size=0.2, random_state=42)

In [None]:
X_train

In [None]:
y_train

In [None]:
bow_matrix_1.shape

In [None]:
vt = VarianceThreshold(threshold=0.001)

In [None]:
#Train a logistic regression model.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set.
y_pred = model.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(report)

In [None]:
scores = cross_validate(model, bow_matrix_1, d1['label'], cv=5,scoring=('balanced_accuracy', 'f1', 'roc_auc'))
print(scores['test_balanced_accuracy'].mean())
print(scores['test_f1'].mean())
print(scores['test_roc_auc'].mean())

In [None]:
bow_vt_1 = vt.fit_transform(bow_matrix_1)

In [None]:
bow_vt_1.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(bow_vt_1, d1['label'], test_size=0.2, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set.
y_pred = model.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(report)

In [20]:
tfidf = TfidfTransformer()

In [None]:
tfidf_matrix_1 = tfidf.fit_transform(bow_matrix_1)

In [None]:
X_t_train, X_t_test, y_t_train, y_t_test = train_test_split(tfidf_matrix_1, d1['label'], test_size=0.2, random_state=42)
model.fit(X_t_train, y_t_train)

# Predict on the test set.
y_t_pred = model.predict(X_t_test)

# Evaluate the model.
accuracy = accuracy_score(y_t_test, y_t_pred)
report = classification_report(y_t_test, y_t_pred)

print(f"Accuracy: {accuracy}")
print(report)

In [None]:
scores = cross_validate(model, tfidf_matrix_1, d1['label'], cv=5,scoring=('balanced_accuracy', 'f1', 'roc_auc'))
print(scores['test_balanced_accuracy'].mean())
print(scores['test_f1'].mean())
print(scores['test_roc_auc'].mean())

In [None]:
vt1 = VarianceThreshold(threshold=0.0001)
tfidf_vt_1 = vt1.fit_transform(tfidf_matrix_1)

In [None]:
tfidf_vt_1.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_vt_1, d1['label'], test_size=0.2, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set.
y_pred = model.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(report)

In [None]:
bow_matrix_2 = vectorizer.fit_transform(d2["text"].apply(lambda x: " ".join(map(str, x))))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(bow_matrix_2, d2['label'], test_size=0.2, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set.
y_pred = model.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"f1: {f1}")
print(f"roc_auc_score: {roc_auc}")
print(report)

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
# scores = cross_validate(model, bow_matrix_2, d2['label'], cv=5,scoring=('balanced_accuracy', 'f1', 'roc_auc'))
# print(scores['test_balanced_accuracy'].mean())
# print(scores['test_f1'].mean())
# print(scores['test_roc_auc'].mean())

In [19]:
oversample = RandomOverSampler(sampling_strategy='minority')
smote = SMOTE()
bsmote = BorderlineSMOTE()
svmsmote = SVMSMOTE()
adasyn = ADASYN()

In [None]:
def test_oversample(X_train, y_train, X_test, y_test, oversample, model):
    X_train_o, y_train_o = oversample.fit_resample(X_train, y_train)
    model.fit(X_train_o, y_train_o)

    # Predict on the test set.
    y_pred = model.predict(X_test)

    # Evaluate the model.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    report = classification_report(y_test, y_pred)
    
    print(f"Accuracy: {accuracy}")
    print(f"f1: {f1}")
    print(f"roc_auc_score: {roc_auc}")
    print(report)
    print("-"*40)

In [None]:
test_oversample(X_train, y_train, X_test, y_test, oversample, model)

In [None]:
X_train_o, y_train_o = oversample.fit_resample(X_train, y_train)
model.fit(X_train_o, y_train_o)

# Predict on the test set.
y_pred = model.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(report)

In [None]:
%%time

test_oversample(X_train, y_train, X_test, y_test, smote, model)

In [None]:
%%time

test_oversample(X_train, y_train, X_test, y_test, bsmote, model)

In [None]:
%%time

test_oversample(X_train, y_train, X_test, y_test, svmsmote, model)

In [None]:
%%time

test_oversample(X_train, y_train, X_test, y_test, adasyn, model)

In [None]:
tfidf_matrix_2 = tfidf.fit_transform(bow_matrix_2)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix_2, d2['label'], test_size=0.2, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set.
y_pred = model.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"f1: {f1}")
print(f"roc_auc_score: {roc_auc}")
print(report)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
scores = cross_validate(model, tfidf_matrix_2, d2['label'], cv=5,scoring=('balanced_accuracy', 'f1', 'roc_auc'))
print(scores['test_balanced_accuracy'].mean())
print(scores['test_f1'].mean())
print(scores['test_roc_auc'].mean())

In [None]:
test_oversample(X_train, y_train, X_test, y_test, smote, model)
test_oversample(X_train, y_train, X_test, y_test, bsmote, model)
test_oversample(X_train, y_train, X_test, y_test, svmsmote, model)
test_oversample(X_train, y_train, X_test, y_test, adasyn, model)

In [None]:
tfidf_vt_2 = vt1.fit_transform(tfidf_matrix_2)
X_train, X_test, y_train, y_test = train_test_split(tfidf_vt_2, d2['label'], test_size=0.2, random_state=42)
tfidf_vt_2.shape

In [None]:
test_oversample(X_train, y_train, X_test, y_test, smote, model)
test_oversample(X_train, y_train, X_test, y_test, bsmote, model)
test_oversample(X_train, y_train, X_test, y_test, svmsmote, model)
test_oversample(X_train, y_train, X_test, y_test, adasyn, model)

In [None]:
#pd.Series(X.ravel()).apply(lambda x: " ".join(map(str, x)))

In [21]:
def get_k_val(d1x, d1y, d2x, d2y, k):
    output = []
    
    samples_per_class_in_test = 250  # Adjust as needed

    # Initialize empty lists to store the train and test indices
    train_indices1 = [[] for i in range(k)]
    train_indices2 = [[] for i in range(k)]
    test_indices1 = [[] for i in range(k)]
    test_indices2 = [[] for i in range(k)]
    
    for class_label in [0,1]:
        # Get the indices of samples belonging to the current class
        class_indices1 = np.where(d1y == class_label)[0]
        class_indices2 = np.where(d2y == class_label)[0]

        # Randomly select samples_per_class_in_test samples from this class
        selected_indices1 = np.random.choice(class_indices1, samples_per_class_in_test*k, replace=False)
        selected_indices2 = np.random.choice(class_indices2, samples_per_class_in_test*k, replace=False)
        
        selected_indices1 = selected_indices1.reshape(k, samples_per_class_in_test)
        selected_indices2 = selected_indices2.reshape(k, samples_per_class_in_test)
        
        for i in range(k):
            test_indices1[i].extend(selected_indices1[i])
            test_indices2[i].extend(selected_indices2[i])
            

            # Add the remaining samples to the train set indices
            remaining_indices = np.setdiff1d(class_indices1, selected_indices1[i])
            train_indices1[i].extend(remaining_indices)
            
            remaining_indices = np.setdiff1d(class_indices2, selected_indices2[i])
            train_indices2[i].extend(remaining_indices)
    print(len(train_indices1[0]))
    print(len(train_indices2[0]))
    print(type(d1x[train_indices1[0]]))
    print(type(d1y[train_indices1[0]]))
    for i in range(k):
        # Split the data into train and test sets using the selected indices
        output.append([d1x[train_indices1[i]].append(d2x[train_indices2[i]], ignore_index = True), d1x[test_indices1[i]].append(d2x[test_indices2[i]], ignore_index= True), d1y[train_indices1[i]].append(d2y[train_indices2[i]], ignore_index= True), d1y[test_indices1[i]].append(d2y[test_indices2[i]], ignore_index= True)])
    
    return output, len(train_indices1[0])


In [22]:
data, d1_len = get_k_val(d1['text'], d1['label'], d2['text'], d2['label'], 5)

19000
14400
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [23]:
d1_len

19000

In [24]:
#feature_data = get_k_val(bow_matrix_1.toarray, d1['label'], bow_matrix_2, d2['label'], 5)

In [25]:
X_train, X_test, y_train, y_test = data[0]

In [None]:
X_train.shape

In [None]:
X_train[19000:].shape

In [None]:
X_train_2 = X_train[19000:]
y_train_2 = y_train[19000:]

In [None]:
X_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

In [None]:
y_train.sum()

In [None]:
y_test.sum()

In [26]:
bow_matrix_train = vectorizer.fit_transform(X_train.apply(lambda x: " ".join(map(str, x))))
bow_matrix_test = vectorizer.transform(X_test.apply(lambda x: " ".join(map(str, x))))

tfidf_matrix_train = tfidf.fit_transform(bow_matrix_train)
tfidf_matrix_test = tfidf.transform(bow_matrix_test)

In [None]:
bow_matrix_train.shape

In [None]:
bow_matrix_train[:19000].shape

In [None]:
y_train[:19000].shape

In [None]:
bow_matrix_train_2 = bow_matrix_train[19000:]
y_train_2 = y_train[19000:]

In [None]:
bow_matrix_train_2_o, y_train_2_o = oversample.fit_resample(bow_matrix_train_2, y_train_2)

In [None]:
bow_matrix_train_2_o.shape

In [None]:
y_train_2_o.shape

In [30]:
import scipy as sp

In [29]:
import scipy as sp
X_train_o = sp.sparse.vstack((bow_matrix_train[:19000], bow_matrix_train_2_o))

NameError: name 'bow_matrix_train_2_o' is not defined

In [None]:
type(bow_matrix_train)

In [None]:
type(X_train_o)

In [None]:
y_train_o = np.append(y_train[:19000], y_train_2_o)

In [None]:
model = LogisticRegression(class_weight='balanced')

In [None]:
model.fit(X_train_o, y_train_o)

In [None]:
# Predict on the test set.
y_pred = model.predict(bow_matrix_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(bow_matrix_test)[:, 1])
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"f1: {f1}")
print(f"roc_auc_score: {roc_auc}")
print(report)

In [None]:
print(bow_matrix_train.shape)
print(bow_matrix_test.shape)
print(tfidf_matrix_train.shape)
print(tfidf_matrix_test.shape)

In [None]:
def evaluate(X_train, y_train, X_test, y_test, model, d1_len, oversample = None):
    if oversample is not None:
        
        X_train_2_o, y_train_2_o = oversample.fit_resample(X_train[d1_len:], y_train[d1_len:])
        X_train_o = sp.sparse.vstack((X_train[:d1_len], X_train_2_o))
        y_train_o = np.append(y_train[:d1_len], y_train_2_o)
        
        model.fit(X_train_o, y_train_o)
    else:
        model.fit(X_train, y_train)
    
    # Predict on the test set.
    y_pred = model.predict(X_test)

    # Evaluate the model.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    report = classification_report(y_test, y_pred)

    print(f"Accuracy: {accuracy}")
    print(f"f1: {f1}")
    print(f"roc_auc_score: {roc_auc}")
    print(report)
    print("-"*50)
    # Predict on the test set.
    print('Domain1')
    y_pred = model.predict(X_test[:500])
    # Evaluate the model.
    accuracy = accuracy_score(y_test[:500], y_pred)
    f1 = f1_score(y_test[:500], y_pred)
    roc_auc = roc_auc_score(y_test[:500], model.predict_proba(X_test[:500])[:, 1])
    report = classification_report(y_test[:500], y_pred)

    print(f"Accuracy: {accuracy}")
    print(f"f1: {f1}")
    print(f"roc_auc_score: {roc_auc}")
    print(report)
    # Predict on the test set.
    print("-"*50)
    print("Domain 2")
    
    y_pred = model.predict(X_test[500:])
    # Evaluate the model.
    accuracy = accuracy_score(y_test[500:], y_pred)
    f1 = f1_score(y_test[500:], y_pred)
    roc_auc = roc_auc_score(y_test[500:], model.predict_proba(X_test[500:])[:, 1])
    report = classification_report(y_test[500:], y_pred)

    print(f"Accuracy: {accuracy}")
    print(f"f1: {f1}")
    print(f"roc_auc_score: {roc_auc}")
    print(report)
    print("-"*50)

In [None]:
evaluate(bow_matrix_train, y_train, bow_matrix_test, y_test, model, d1_len)

In [None]:
print("oversample")
evaluate(bow_matrix_train, y_train, bow_matrix_test, y_test, model, d1_len, oversample)
print("smote")
evaluate(bow_matrix_train, y_train, bow_matrix_test, y_test, model, d1_len, smote)
print("bsmote")
evaluate(bow_matrix_train, y_train, bow_matrix_test, y_test, model, d1_len, bsmote)
print("svmsmote")
evaluate(bow_matrix_train, y_train, bow_matrix_test, y_test, model, d1_len, svmsmote)
print("adasyn")
evaluate(bow_matrix_train, y_train, bow_matrix_test, y_test, model, d1_len, adasyn)

In [None]:
evaluate(tfidf_matrix_train, y_train, tfidf_matrix_test, y_test, model, d1_len)

In [None]:
print("oversample")
evaluate(tfidf_matrix_train, y_train, tfidf_matrix_test, y_test, model, d1_len, oversample)
print("smote")
evaluate(tfidf_matrix_train, y_train, tfidf_matrix_test, y_test, model, d1_len, smote)
print("bsmote")
evaluate(tfidf_matrix_train, y_train, tfidf_matrix_test, y_test, model, d1_len, bsmote)
print("svmsmote")
evaluate(tfidf_matrix_train, y_train, tfidf_matrix_test, y_test, model, d1_len, svmsmote)
print("adasyn")
evaluate(tfidf_matrix_train, y_train, tfidf_matrix_test, y_test, model, d1_len, adasyn)

# Model Selection

In [None]:
from sklearn.utils import class_weight
from sklearn.linear_model import RidgeClassifier
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(d2['label']), y=d2['label'])

In [None]:
model = LogisticRegression(class_weight='balanced')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix_2, d2['label'], test_size=0.2, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set.
y_pred = model.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"f1: {f1}")
print(f"roc_auc_score: {roc_auc}")
print(report)

In [None]:
test_oversample(X_train, y_train, X_test, y_test, smote, model)
test_oversample(X_train, y_train, X_test, y_test, bsmote, model)
test_oversample(X_train, y_train, X_test, y_test, svmsmote, model)
test_oversample(X_train, y_train, X_test, y_test, adasyn, model)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_vt_2, d2['label'], test_size=0.2, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set.
y_pred = model.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"f1: {f1}")
print(f"roc_auc_score: {roc_auc}")
print(report)

In [None]:
test_oversample(X_train, y_train, X_test, y_test, smote, model)
test_oversample(X_train, y_train, X_test, y_test, bsmote, model)
test_oversample(X_train, y_train, X_test, y_test, svmsmote, model)
test_oversample(X_train, y_train, X_test, y_test, adasyn, model)

In [None]:
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights

In [None]:
evaluate(bow_matrix_train, y_train, bow_matrix_test, y_test, model)

In [None]:
evaluate(tfidf_matrix_train, y_train, tfidf_matrix_test, y_test, model)

In [None]:

warnings.filterwarnings("ignore")
param={'solver': ['lbfgs', 'sag','newton-cholesky'],
       'c':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]}
best_score = 0
best_solver = ''
best_c = 0
for s in param['solver']:
    for c in param['c']:
        lr = LogisticRegression(C = c, solver = s, class_weight='balanced')
        lr.fit(bow_matrix_train, y_train)
        result = lr.score(bow_matrix_test, y_test)
        print("Solver: "+str(s)+", c: "+str(c)+", accurancy: " + str(result))
        if result > best_score:
            best_score = result
            best_solver = s
            best_c = c
lr =LogisticRegression(C = best_c, solver = best_solver, class_weight='balanced')
lr.fit(bow_matrix_train, y_train)
print('best param: solver = ' +str(best_solver)+ ", c: "+str(best_c)+" with accuracy:" + str(best_score))

In [None]:

warnings.filterwarnings("ignore")
param={'solver': ['lbfgs', 'sag','newton-cholesky'],
       'c':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]}
best_score = 0
best_solver = ''
best_c = 0
for s in param['solver']:
    for c in param['c']:
        lr = LogisticRegression(C = c, solver = s, class_weight='balanced')
        lr.fit(tfidf_matrix_train, y_train)
        result = lr.score(tfidf_matrix_test, y_test)
        print("Solver: "+str(s)+", c: "+str(c)+", accurancy: " + str(result))
        if result > best_score:
            best_score = result
            best_solver = s
            best_c = c
lr =LogisticRegression(C = best_c, solver = best_solver, class_weight='balanced')
lr.fit(bow_matrix_train, y_train)
print('best param: solver = ' +str(best_solver)+ ", c: "+str(best_c)+" with accuracy:" + str(best_score))

In [None]:

warnings.filterwarnings("ignore")

X_train_o, y_train_o = oversample.fit_resample(bow_matrix_train, y_train)
    
param={'solver': ['lbfgs', 'sag','newton-cholesky'],
       'c':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]}
best_score = 0
best_solver = ''
best_c = 0
for s in param['solver']:
    for c in param['c']:
        lr = LogisticRegression(C = c, solver = s, class_weight='balanced')
        lr.fit(X_train_o, y_train_o)
        result = lr.score(bow_matrix_test, y_test)
        print("Solver: "+str(s)+", c: "+str(c)+", accurancy: " + str(result))
        if result > best_score:
            best_score = result
            best_solver = s
            best_c = c
lr =LogisticRegression(C = best_c, solver = best_solver, class_weight='balanced')
lr.fit(bow_matrix_train, y_train)
print('best param: solver = ' +str(best_solver)+ ", c: "+str(c)+" with accuracy:" + str(best_score))

In [43]:

warnings.filterwarnings("ignore")

X_train_o, y_train_o = oversample.fit_resample(tfidf_matrix_train, y_train)
    
param={'solver': ['lbfgs', 'sag','newton-cholesky'],
       'c':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]}
best_score = 0
best_solver = ''
best_c = 0
for s in param['solver']:
    for c in param['c']:
        lr = LogisticRegression(C = c, solver = s, class_weight='balanced')
        lr.fit(X_train_o, y_train_o)
        result = lr.score(tfidf_matrix_test, y_test)
        print("Solver: "+str(s)+", c: "+str(c)+", accurancy: " + str(result))
        if result > best_score:
            best_score = result
            best_solver = s
            best_c = c
lr =LogisticRegression(C = best_c, solver = best_solver, class_weight='balanced')
lr.fit(bow_matrix_train, y_train)
print('best param: solver = ' +str(best_solver)+ ", c: "+str(c)+" with accuracy:" + str(best_score))

Solver: lbfgs, c: 0.001, accurancy: 0.621
Solver: lbfgs, c: 0.005, accurancy: 0.655
Solver: lbfgs, c: 0.01, accurancy: 0.663
Solver: lbfgs, c: 0.05, accurancy: 0.687
Solver: lbfgs, c: 0.1, accurancy: 0.698
Solver: lbfgs, c: 0.5, accurancy: 0.716
Solver: lbfgs, c: 1, accurancy: 0.718
Solver: lbfgs, c: 5, accurancy: 0.729
Solver: lbfgs, c: 10, accurancy: 0.732
Solver: lbfgs, c: 50, accurancy: 0.723
Solver: lbfgs, c: 100, accurancy: 0.721
Solver: sag, c: 0.001, accurancy: 0.621
Solver: sag, c: 0.005, accurancy: 0.655
Solver: sag, c: 0.01, accurancy: 0.663
Solver: sag, c: 0.05, accurancy: 0.687
Solver: sag, c: 0.1, accurancy: 0.698
Solver: sag, c: 0.5, accurancy: 0.716
Solver: sag, c: 1, accurancy: 0.718
Solver: sag, c: 5, accurancy: 0.728
Solver: sag, c: 10, accurancy: 0.732
Solver: sag, c: 50, accurancy: 0.727
Solver: sag, c: 100, accurancy: 0.724
Solver: newton-cholesky, c: 0.001, accurancy: 0.621
Solver: newton-cholesky, c: 0.005, accurancy: 0.655
Solver: newton-cholesky, c: 0.01, accu

In [None]:
warnings.filterwarnings("ignore")

alpha = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]
best_score = 0
best_alpha = 0
for a in alpha:
    
    rr = RidgeClassifier(alpha = a, class_weight='balanced')
    rr.fit(bow_matrix_train, y_train)
    result = rr.score(bow_matrix_test, y_test)
    print("alpha: "+str(a)+", accurancy: " + str(result))
    if result > best_score:
        best_score = result
        best_alpha = a
lr =RidgeClassifier(alpha = best_alpha, class_weight='balanced')
lr.fit(bow_matrix_train, y_train)
print('best param: alpha = ' +str(best_alpha)+" with accuracy:" + str(best_score))

In [None]:
warnings.filterwarnings("ignore")

alpha = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]
best_score = 0
best_alpha = 0
for a in alpha:
    
    rr = RidgeClassifier(alpha = a, class_weight='balanced')
    rr.fit(tfidf_matrix_train, y_train)
    result = rr.score(tfidf_matrix_test, y_test)
    print("alpha: "+str(a)+", accurancy: " + str(result))
    if result > best_score:
        best_score = result
        best_alpha = a
lr =RidgeClassifier(alpha = best_alpha, class_weight='balanced')
lr.fit(bow_matrix_train, y_train)
print('best param: alpha = ' +str(best_alpha)+" with accuracy:" + str(best_score))

In [None]:
warnings.filterwarnings("ignore")

X_train_o, y_train_o = oversample.fit_resample(bow_matrix_train, y_train)
    
alpha = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]
best_score = 0
best_alpha = 0
for a in alpha:
    
    rr = RidgeClassifier(alpha = a, class_weight='balanced')
    rr.fit(X_train_o, y_train_o)
    result = rr.score(bow_matrix_test, y_test)
    print("alpha: "+str(a)+", accurancy: " + str(result))
    if result > best_score:
        best_score = result
        best_alpha = a
lr =RidgeClassifier(alpha = best_alpha, class_weight='balanced')
lr.fit(bow_matrix_train, y_train)
print('best param: alpha = ' +str(best_alpha)+" with accuracy:" + str(best_score))

In [None]:
warnings.filterwarnings("ignore")

X_train_o, y_train_o = oversample.fit_resample(tfidf_matrix_train, y_train)
    
alpha = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]
best_score = 0
best_alpha = 0
for a in alpha:
    
    rr = RidgeClassifier(alpha = a, class_weight='balanced')
    rr.fit(X_train_o, y_train_o)
    result = rr.score(tfidf_matrix_test, y_test)
    print("alpha: "+str(a)+", accurancy: " + str(result))
    if result > best_score:
        best_score = result
        best_alpha = a
lr =RidgeClassifier(alpha = best_alpha, class_weight='balanced')
lr.fit(bow_matrix_train, y_train)
print('best param: alpha = ' +str(best_alpha)+" with accuracy:" + str(best_score))

In [None]:
from sklearn.svm import SVC

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV

In [None]:
C_range = np.logspace(-2, 5, 8)
gamma_range = np.logspace(-6, 1, 16)


In [None]:
%%time
X_train, X_test, y_train, y_test = train_test_split(bow_matrix_1, d1['label'], test_size=0.2, random_state=42)
svm = SVC()
svm.fit(X_train, y_train)

# Predict on the test set.
y_pred = svm.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
#roc_auc = roc_auc_score(y_test, svm.predict_proba(X_test)[:, 1])
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"f1: {f1}")
#print(f"roc_auc_score: {roc_auc}")
print(report)

In [None]:
%%time
X_train, X_test, y_train, y_test = train_test_split(bow_matrix_2, d2['label'], test_size=0.2, random_state=42)
svm = SVC()
svm.fit(X_train, y_train)

# Predict on the test set.
y_pred = svm.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
#roc_auc = roc_auc_score(y_test, svm.predict_proba(X_test)[:, 1])
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"f1: {f1}")
#print(f"roc_auc_score: {roc_auc}")
print(report)

In [None]:
%%time
X_train, X_test, y_train, y_test = train_test_split(bow_matrix_2, d2['label'], test_size=0.2, random_state=42)
svm = SVC(class_weight='balanced')
svm.fit(X_train, y_train)

# Predict on the test set.
y_pred = svm.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
#roc_auc = roc_auc_score(y_test, svm.predict_proba(X_test)[:, 1])
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"f1: {f1}")
#print(f"roc_auc_score: {roc_auc}")
print(report)

In [None]:
%%time
X_train_o, y_train_o = oversample.fit_resample(X_train, y_train)

#X_train, X_test, y_train, y_test = train_test_split(bow_matrix_2, d2['label'], test_size=0.2, random_state=42)
svm = SVC()
svm.fit(X_train_o, y_train_o)

# Predict on the test set.
y_pred = svm.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
#roc_auc = roc_auc_score(y_test, svm.predict_proba(X_test)[:, 1])
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"f1: {f1}")
#print(f"roc_auc_score: {roc_auc}")
print(report)

In [None]:
%%time
X_train_o, y_train_o = oversample.fit_resample(X_train, y_train)

#X_train, X_test, y_train, y_test = train_test_split(bow_matrix_2, d2['label'], test_size=0.2, random_state=42)
svm = SVC(class_weight='balanced')
svm.fit(X_train_o, y_train_o)

# Predict on the test set.
y_pred = svm.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
#roc_auc = roc_auc_score(y_test, svm.predict_proba(X_test)[:, 1])
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"f1: {f1}")
#print(f"roc_auc_score: {roc_auc}")
print(report)

In [None]:
%%time
evaluate(tfidf_matrix_train, y_train, tfidf_matrix_test, y_test, SVC(class_weight='balanced', probability = True), d1_len, svmsmote)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier()

In [27]:
test_set = pd.read_json(path_or_buf="./data/test_Set.json", lines=True)

In [31]:
model = LogisticRegression(class_weight='balanced')

X_train_2_o, y_train_2_o = oversample.fit_resample(tfidf_matrix_train[d1_len:], y_train[d1_len:])
X_train_o = sp.sparse.vstack((tfidf_matrix_train[:d1_len], X_train_2_o))
y_train_o = np.append(y_train[:d1_len], y_train_2_o)

model.fit(X_train_o, y_train_o)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
test = vectorizer.transform(test_set['text'].apply(lambda x: " ".join(map(str, x))))
test = tfidf.transform(test)

In [33]:
prediction = model.predict(test)

In [35]:
test_set['class'] = prediction

In [39]:
test_set[['id', 'class']]

Unnamed: 0,id,class
0,0,1
1,1,0
2,2,1
3,3,0
4,4,0
...,...,...
995,995,0
996,996,1
997,997,0
998,998,1


In [40]:
output = test_set[['id', 'class']]

In [42]:
output.to_csv('output.csv',index=False)