In [1]:
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from time import time
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

pd.options.mode.chained_assignment = None



In [2]:
def load_data():
    df=pd.read_csv("fake.csv")
    print("reading done")
    print(df.size)
    return df


def data_cleaning(df):
    
    # Pre-processing Text Reviews
    # Remove Stop Words
    stop = stopwords.words('english')
    df['text_'] = df['text_'].apply(
        lambda x: ' '.join(word for word in str(x).split() if word not in stop))

    # Remove Punctuations
    tokenizer = RegexpTokenizer(r'\w+')
    df['text_'] = df['text_'].apply(
        lambda x: ' '.join(word for word in tokenizer.tokenize(x)))

    # Lowercase Words
    df['text_'] = df['text_'].apply(
        lambda x: str(x).lower())
    
    df.drop([ 'category'], axis=1, inplace=True)
  
    print("Data Cleaning Complete")
    return df


In [8]:
def feature_engineering(df, batch_size=3000):
    print("Feature Engineering: Creating New Features")

    def process_batch(batch):
        sid = SentimentIntensityAnalyzer()
        for index, row in batch.iterrows():
            text1 = row['text_']
            sentiment_dict = sid.polarity_scores(text1)
            df.at[index, 'sentiment'] = sentiment_dict['compound']

        def text_process(review):
            nopunc = [char for char in review if char not in string.punctuation]
            nopunc = ''.join(nopunc)
            return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

        bow_transformer = CountVectorizer(analyzer=text_process)
        bow_transformer.fit(batch['text_'])
        print("Batch Vocabulary:", len(bow_transformer.vocabulary_))

        bow_reviews = bow_transformer.transform(batch['text_'])

        tfidf_transformer = TfidfTransformer().fit(bow_reviews)
        tfidf_reviews = tfidf_transformer.transform(bow_reviews)
        df.loc[batch.index, 'tfidf'] = pd.Series(tfidf_reviews.toarray().tolist())

    # Split data into batches
    num_batches = int(np.ceil(len(df) / batch_size))
    for i in range(num_batches):
        batch_start = i * batch_size
        batch_end = min((i + 1) * batch_size, len(df))
        batch = df.iloc[batch_start:batch_end].copy()
        process_batch(batch)

    print("Feature Engineering Complete")
    return df


In [9]:
def under_sampling(df):
    print("Under-Sampling Data")
    # Count of Reviews
    # print("Authentic", len(df[(df['flagged'] == 'N')]))
    # print("Fake", len(df[(df['flagged'] == 'Y')]))

    sample_size = len(df[(df['label'] == 'OR')])

    authentic_reviews_df = df[df['label'] == 'OR']
    fake_reviews_df = df[df['label'] == 'CG']

    authentic_reviews_us_df = authentic_reviews_df.sample(sample_size)
    under_sampled_df = pd.concat([authentic_reviews_us_df, fake_reviews_df], axis=0)

    print("Under-Sampling Complete")
    return under_sampled_df


In [10]:
def semi_supervised_learning(df, model, algorithm, threshold=0.8, iterations=40):
    df = df.copy()
    print("Training "+algorithm+" Model")
    labels = df['label']

    df.drop([ 'label','text_'], axis=1, inplace=True)

    train_data, test_data, train_label, test_label = train_test_split(df, labels, test_size=0.3, random_state=40)

    test_data_copy = test_data.copy()
    test_label_copy = test_label.copy()

    all_labeled = False

    current_iteration = 0

    pbar = tqdm(total=iterations)

    while not all_labeled and (current_iteration < iterations):
        # print("Before train data length : ", len(train_data))
        # print("Before test data length : ", len(test_data))
        current_iteration += 1
        model.fit(train_data, train_label)

        probabilities = model.predict_proba(test_data)
        pseudo_labels = model.predict(test_data)

        indices = np.argwhere(probabilities > threshold)

        # print("rows above threshold : ", len(indices))
        for item in indices:
            train_data.loc[test_data.index[item[0]]] = test_data.iloc[item[0]]
            train_label.loc[test_data.index[item[0]]] = pseudo_labels[item[0]]
        test_data.drop(test_data.index[indices[:, 0]], inplace=True)
        test_label.drop(test_label.index[indices[:, 0]], inplace=True)
        # print("After train data length : ", len(train_data))
        # print("After test data length : ", len(test_data))
        print("--" * 20)

        if len(test_data) == 0:
            print("Exiting loop")
            all_labeled = True
        pbar.update(1)
    pbar.close()
    predicted_labels = model.predict(test_data_copy)

    # print('Best Params : ', grid_clf_acc.best_params_)
    print(algorithm + ' Model Results')
    print('--' * 20)
    print('Accuracy Score : ' + str(accuracy_score(test_label_copy, predicted_labels)))
    print('Precision Score : ' + str(precision_score(test_label_copy, predicted_labels, pos_label="CG")))
    print('Recall Score : ' + str(recall_score(test_label_copy, predicted_labels, pos_label="CG")))
    print('F1 Score : ' + str(f1_score(test_label_copy, predicted_labels, pos_label="CG")))
    print('Confusion Matrix : \n' + str(confusion_matrix(test_label_copy, predicted_labels)))
    #plot_confusion_matrix(test_label_copy, predicted_labels, classes=['OR', 'CG'],title=algorithm + ' Confusion Matrix').show()


In [11]:
def main():
    start_time = time()
    df = load_data()
    print(df)
    df = data_cleaning(df)
    print(df)
    df = feature_engineering(df)
    #under_sampled_df = df
    under_sampled_df = under_sampling(df)
    rf = RandomForestClassifier(random_state=42, criterion='entropy', max_depth=14, max_features='sqrt',n_estimators=500)
    nb = GaussianNB()
    print(df)
    semi_supervised_learning(under_sampled_df, model=rf, threshold=0.8, iterations=5, algorithm='Random Forest')
    semi_supervised_learning(under_sampled_df, model=nb, threshold=0.8, iterations=5, algorithm='Naive Bayes')
    end_time = time()
    print("Time taken : ", end_time - start_time)


In [12]:
main()

reading done
162376
                 category  rating label  \
0      Home_and_Kitchen_5     5.0    CG   
1      Home_and_Kitchen_5     5.0    CG   
2      Home_and_Kitchen_5     5.0    CG   
3      Home_and_Kitchen_5     1.0    CG   
4      Home_and_Kitchen_5     5.0    CG   
...                   ...     ...   ...   
40589       Electronics_5     4.0    OR   
40590       Electronics_5     4.0    OR   
40591       Electronics_5     4.0    OR   
40592       Electronics_5     4.0    OR   
40593       Electronics_5     4.0    OR   

                                                   text_  
0      Love this!  Well made, sturdy, and very comfor...  
1      love it, a great upgrade from the original.  I...  
2      This pillow saved my back. I love the look and...  
3      Missing information on how to use it, but it i...  
4      Very nice set. Good quality. We have had the s...  
...                                                  ...  
40589                                       nice p


  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A

ValueError: setting an array element with a sequence.