# Parameters

In [None]:
# --- Locations on the mounted Google Drive ---------------------------------
# Folder that receives the generated train/valid/test CSV files.
destination_folder = '/content/drive/My Drive/Sihem/BERT_Classifier/Data'
# CSV file holding the raw, un-split data.
raw_data_path = '/content/drive/My Drive/Sihem/BERT_Classifier/Data/data.csv'

# --- Split proportions -----------------------------------------------------
# Passed as `train_size` for the train/validation split.
train_valid_ratio = 0.80
# Passed as `test_size` for the train/test split.
train_test_ratio = 0.20

# --- Text truncation -------------------------------------------------------
# Number of leading words kept by trim_string().
first_n_words = 200

# Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Preprocessing

In [None]:
def trim_string(x, max_words=None):
    """Return ``x`` cut down to its first ``max_words`` words.

    Words are found with ``str.split()`` (any run of whitespace is a
    separator) and re-joined with single spaces, so leading/trailing
    whitespace is removed and internal whitespace runs collapse to one space.

    Args:
        x: The string to shorten.
        max_words: Maximum number of words to keep. When ``None`` (the
            default) the module-level ``first_n_words`` setting is used, so
            single-argument calls such as ``Series.apply(trim_string)`` keep
            working unchanged.

    Returns:
        The shortened, whitespace-normalised string.

    Raises:
        ValueError: If the word limit is negative.
    """
    if max_words is None:
        max_words = first_n_words
    if max_words < 0:
        # A negative limit would make the slice below drop words from the
        # END of the text instead of limiting its length.
        raise ValueError('max_words must be non-negative, got %r' % (max_words,))

    # `maxsplit` stops scanning once enough words were found; any unsplit
    # remainder ends up in one extra trailing element that the slice discards.
    words = x.split(maxsplit=max_words)
    return ' '.join(words[:max_words])

In [None]:
# Mount Google Drive at /content/drive so that the Drive-backed paths used in
# this notebook (raw_data_path, destination_folder) can be read and written.
# NOTE: google.colab is imported here, so this cell presumably only runs
# inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the raw CSV and discard every row that has a missing value in any column.
df_raw = pd.read_csv(raw_data_path).dropna()

# NOTE: the label-encoding, title+text concatenation and short-text filtering
# steps that used to live here were disabled (commented out); the 'label' and
# 'text' columns are therefore used exactly as they come from the CSV.

# Keep only the leading words of each text (see trim_string).
df_raw['text'] = df_raw['text'].apply(trim_string)

# Separate the two classes so each one is split with the same proportions.
label_column = df_raw['label']
df_non_medical = df_raw[label_column == 0]
df_medical = df_raw[label_column == 1]

# Class 0: hold out the test share first, then carve the validation share
# out of what is left. random_state is fixed so the split is reproducible.
df_non_medical_full_train, df_non_medical_test = train_test_split(
    df_non_medical, test_size=train_test_ratio, random_state=1)
df_non_medical_train, df_non_medical_valid = train_test_split(
    df_non_medical_full_train, train_size=train_valid_ratio, random_state=1)

# Class 1: same two-stage split.
df_medical_full_train, df_medical_test = train_test_split(
    df_medical, test_size=train_test_ratio, random_state=1)
df_medical_train, df_medical_valid = train_test_split(
    df_medical_full_train, train_size=train_valid_ratio, random_state=1)

# Stack the per-class pieces (class 0 rows first, then class 1) and renumber
# the index of each resulting split.
df_train, df_valid, df_test = (
    pd.concat(class_parts, ignore_index=True, sort=False)
    for class_parts in (
        [df_non_medical_train, df_medical_train],
        [df_non_medical_valid, df_medical_valid],
        [df_non_medical_test, df_medical_test],
    )
)

# Persist each split as <destination_folder>/<name>.csv without the index column.
for split_name, split_frame in (('train', df_train),
                                ('valid', df_valid),
                                ('test', df_test)):
    split_frame.to_csv(destination_folder + '/' + split_name + '.csv', index=False)

In [None]:
# Report the number of rows in the train, validation and test splits,
# one count per line and in that order.
for split_frame in (df_train, df_valid, df_test):
    print(len(split_frame))