In [1]:
! pip install transformers



In [3]:
# Mount Google Drive at /content/drive; the dataset paths used further down
# in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import csv
import os
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import tensorflow as tf
from  transformers import BertTokenizer, TFBertForSequenceClassification,TFBertModel
from tensorflow.keras.utils import to_categorical
from transformers import BertConfig, BertModel
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import matplotlib.pyplot  as plt
from collections import defaultdict
import multiprocessing
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import roc_curve,accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle
class ReadData:
    """Load a delimited text file and derive train/validation/test splits.

    On construction the file at ``path`` is parsed, reduced to the feature
    column ``X_col`` and the label column ``y_col``, stripped of incomplete
    rows and shuffled with a fixed seed. The cleaned frame is kept in
    ``self.data``; every other result attribute starts out as an empty frame
    and is filled in by ``data_split``, ``resample_over`` or
    ``resample_under``.

    All values are kept as the strings produced by ``csv.reader``; no type
    conversion is performed.
    """

    # Number of characters handed to csv.Sniffer when guessing the dialect.
    _SNIFF_CHARS = 18192

    def __init__(self,path, X_col,y_col):
        """Read ``path`` and keep only the ``X_col`` / ``y_col`` columns.

        Args:
            path: Location of the delimited text file.
            X_col: Name of the feature (text) column.
            y_col: Name of the label column.
        """
        self.path=path
        self.X_col=X_col
        self.y_col=y_col
        self.tar_cols=[X_col]+[y_col]
        self.data=self.open_file(self.path)

        # Filled in by data_split().
        self.X_train=pd.DataFrame()
        self.y_train=pd.DataFrame()
        self.X_val=pd.DataFrame()
        self.y_val=pd.DataFrame()
        self.X_test=pd.DataFrame()
        self.y_test=pd.DataFrame()

        # Filled in by resample_over().
        self.X_train_resampled=pd.DataFrame()
        self.y_train_resampled=pd.DataFrame()

        # Filled in by resample_under(); declared here so the attributes
        # exist before that method has been called.
        self.X_resampled=pd.DataFrame()
        self.y_resampled=pd.DataFrame()

    @staticmethod
    def _fallback_dialect(path):
        """Choose a csv dialect from the file extension when sniffing fails."""
        extension = os.path.splitext(str(path))[1].lower()
        if extension in ('.tsv', '.tab'):
            return csv.excel_tab
        return csv.excel

    def open_file(self,path):
        """Parse ``path`` and return the cleaned, shuffled target columns.

        The dialect is guessed from the first ``_SNIFF_CHARS`` characters.
        When the sniffer cannot decide, the dialect falls back to
        tab-separated for ``.tsv``/``.tab`` files and comma-separated
        otherwise.

        Rows are dropped when either target column is an empty string or is
        missing altogether (a line with fewer fields than the header).

        Args:
            path: Location of the delimited text file.

        Returns:
            DataFrame holding only ``self.tar_cols``, shuffled with
            ``random_state=0``.

        Raises:
            csv.Error: If the file contains no rows at all.
        """
        with open(path, 'r', newline='', encoding='utf-8') as f:
            sample = f.read(self._SNIFF_CHARS)
            f.seek(0)
            try:
                dialect = csv.Sniffer().sniff(sample)
            except csv.Error:
                # Free-text columns frequently defeat the sniffer's
                # heuristics; guess from the extension instead of aborting.
                dialect = self._fallback_dialect(path)
            reader = csv.reader(f, dialect)
            header = next(reader, None)
            if header is None:
                raise csv.Error(f"{path!r} contains no rows")
            data = pd.DataFrame(reader,  columns=header)

        data=data[self.tar_cols]
        # Short lines are padded with missing values by the DataFrame
        # constructor, and a missing value compares unequal to '', so both
        # conditions have to be checked explicitly.
        is_complete = data.notna().all(axis=1) & data.ne('').all(axis=1)
        data = data[is_complete]
        data = shuffle(data,random_state=0)
        return data

    def data_split(self):
        """Split ``self.data`` into stratified 80/10/10 partitions.

        The first split holds out 20% of the rows; that hold-out is then
        halved into the test and validation partitions. Both splits are
        stratified on the label column and use ``random_state=0``.

        Returns:
            Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
        """
        self.X_train, X_temp, self.y_train, y_temp = train_test_split(self.data[self.X_col],self.data[self.y_col],random_state=0,test_size=0.2, stratify=self.data[self.y_col])
        self.X_test, self.X_val, self.y_test, self.y_val = train_test_split(X_temp,y_temp,random_state=0,test_size=0.5, stratify=y_temp)
        return self.X_train, self.y_train,self.X_val, self.y_val,  self.X_test, self.y_test

    def resample_over(self, X, y):
        """Randomly oversample ``X``/``y`` with the 'minority' strategy.

        The resampled frames are stored in ``self.X_train_resampled`` and
        ``self.y_train_resampled``. Unlike ``resample_under`` the result is
        not reshuffled afterwards.

        Args:
            X: Feature Series.
            y: Label Series.

        Returns:
            Tuple ``(X_resampled, y_resampled)`` after ``squeeze()``.
        """
        oversample = RandomOverSampler(sampling_strategy='minority',random_state=0)
        self.X_train_resampled, self.y_train_resampled = oversample.fit_resample(X.to_frame(), y.to_frame())
        return self.X_train_resampled.squeeze(), self.y_train_resampled.squeeze()

    def resample_under(self, X, y):
        """Randomly undersample ``X``/``y`` with the 'majority' strategy.

        Features and labels are re-joined, shuffled together with
        ``random_state=42`` and split back by column name, so ``X`` and ``y``
        are expected to be named ``self.X_col`` and ``self.y_col``. The
        results are stored in ``self.X_resampled`` and ``self.y_resampled``.

        Args:
            X: Feature Series.
            y: Label Series.

        Returns:
            Tuple ``(X_resampled, y_resampled)`` after ``squeeze()``.
        """
        undersample = RandomUnderSampler(sampling_strategy='majority',random_state=0)
        self.X_resampled, self.y_resampled = undersample.fit_resample(X.to_frame(), y.to_frame())
        shuffled_df = pd.concat([self.X_resampled, self.y_resampled], axis=1)
        shuffled_df = shuffled_df.sample(frac=1, random_state=42)
        self.X_resampled=shuffled_df[self.X_col]
        self.y_resampled=shuffled_df[self.y_col]
        return self.X_resampled.squeeze(), self.y_resampled.squeeze()


    def to_dataframe(self,X,y):
        """Return ``X`` and ``y`` joined side by side as one DataFrame."""
        return pd.concat([X, y], axis=1)

# Load the preprocessed corpus, keeping only the text and sentiment columns.
read_data = ReadData(
    "/content/drive/MyDrive/Masterthesis/Data/data_processed/preprocessed500000_for_sft.tsv",
    "preprocessed_text",
    "sentiment",
)

# data_split() hands back the partitions in train, validation, test order.
(X_train, y_train,
 X_val, y_val,
 X_test, y_test) = read_data.data_split()

# Only the training partition is undersampled; the validation and test
# partitions are left exactly as the split produced them.
X_under, y_under = read_data.resample_under(X_train, y_train)

# Re-join each partition's text and label columns into a single frame.
df_under, df_val, df_test = (
    read_data.to_dataframe(features, labels)
    for features, labels in (
        (X_under, y_under),
        (X_val, y_val),
        (X_test, y_test),
    )
)



In [None]:
# Write each partition to Drive as a tab-separated file without the index
# column, in the same order as before: test, train, validation.
for _frame, _destination in (
    (df_test, "/content/drive/MyDrive/Masterthesis/Data/data_processed/test_for_sft.tsv"),
    (df_under, "/content/drive/MyDrive/Masterthesis/Data/data_processed/train_for_sft.tsv"),
    (df_val, "/content/drive/MyDrive/Masterthesis/Data/data_processed/val_for_sft.tsv"),
):
    _frame.to_csv(_destination, sep='\t', index=False)