In [59]:
import pickle
import pandas as pd
from sklearn import preprocessing
import re
import scipy
import numpy as np
import glob
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer
from gensim.models.doc2vec import TaggedDocument

In [154]:

class UsePretrainedVectClf:
    def __init__(self, pathToTokenizer, pathToClfModel, pathToWord2VecPretrained, path_to_PreprocessedDataset, \
                 path_to_nonPreprocessedDataset, include_rules = False):
        self.path_to_nonPreprocessedDataset = path_to_nonPreprocessedDataset
        self.path_to_PreprocessedDataset = path_to_PreprocessedDataset
        self.include_rules = include_rules
        self.pathToTokenizer = pathToTokenizer
        self.pathToClfModel = pathToClfModel
        self.pathToWord2VecPretrained = pathToWord2VecPretrained
    
    def traditional_ChangeVectPath(self, pathToTokenizer):
        self.pathToTokenizer = pathToTokenizer   
        
    def traditional_ChangeClfModelPath(self, pathToClfModel):
        self.pathToClfModel = pathToClfModel     
        
    def traditional_loadVect(self):
        Vect = pickle.load(open(self.pathToTokenizer, 'rb'))
        return Vect
    
    def apply_gen_rules_features_ngrams(self, X, X_train_WoR_dtm, X_train_WoR_Features):
        X_Rules_dtm, features_Rules = self.gen_rules_features_ngrams(X)
        X_train_WR_dtm, combined_features = self.concat_sparse_matrices_h(X_train_WoR_dtm, X_Rules_dtm,
                                                                          X_train_WoR_Features, features_Rules)
        return X_train_WR_dtm, combined_features


    def gen_rules_features_ngrams(self, X_data_series):  # , X_data_dtm, features_arg):
        '''sparse matrix and series matrices should be converted to dataframe for applying rules and treating
        it as features...
        I wrote two functions i.e.,sparse_matrix_to_DataFrame() and series_DataFrame()
          for changing datatypes'''
        
        X_data_DF = self.series_to_DataFrame(X_data_series)
        regexes = [
            re.compile(r'\b(I|we)\b.*\b(am|are|will be)\b.*\b(bringing|giving|helping|raising|donating|auctioning)\b',
                       re.I | re.M),
            re.compile(r'\b(I\'m)\b.*\b(bringing|giving|helping|raising|donating|auctioning)\b', re.I | re.M),
            re.compile(r'\b(we\'re)\b.*\b(bringing|giving|helping|raising|donating|auctioning)\b', re.I | re.M),
            re.compile(r'\b(I|we)\b.*\b(will|would like to)\b.*\b(bring|give|help|raise|donate|auction)\b',
                       re.I | re.M),
            re.compile(r'\b(I|we)\b.*\b(will|would like to)\b.*\b(work|volunteer|assist)\b', re.I | re.M),
            re.compile(r'\b(we\'ll)\b.*\b(bring|give|help|raise|donate|auction)\b', re.I | re.M),
            re.compile(r'\b(I|we)\b.*\b(ready|prepared)\b.*\b(bring|give|help|raise|donate|auction)\b', re.I | re.M),
            re.compile(r'\b(where)\b.*\b(can I|can we)\b.*\b(bring|give|help|raise|donate)\b', re.I | re.M),
            re.compile(r'\b(where)\b.*\b(can I|can we)\b.*\b(work|volunteer|assist)\b', re.I | re.M),
            re.compile(r'\b(I|we)\b.*\b(like|want)\b.*\bto\b.*\b(bring|give|help|raise|donate)\b', re.I | re.M),
            re.compile(r'\b(I|we)\b.*\b(like|want)\b.*\bto\b.*\b(work|volunteer|assist)\b', re.I | re.M),
            re.compile(r'\b(will be)\b.*\b(brought|given|raised|donated|auctioned)\b', re.I | re.M),
            re.compile(r'\b\w*\s*\b\?', re.I | re.M),
            re.compile(r'\b(you|u).*(can|could|should|want to)\b', re.I | re.M),
            re.compile(r'\b(can|could|should).*(you|u)\b', re.I | re.M),
            re.compile(r'\b(like|want)\b.*\bto\b.*\b(bring|give|help|raise|donate)\b', re.I | re.M),
            re.compile(r'\b(how)\b.*\b(can I|can we)\b.*\b(bring|give|help|raise|donate)\b', re.I | re.M),
            re.compile(r'\b(how)\b.*\b(can I|can we)\b.*\b(work|volunteer|assist)\b', re.I | re.M)

        ]
        temp = pd.DataFrame()
        features_arg = []
        for i, regex in zip(range(len(regexes)), regexes):
            columnName = "RegEx_" + str(i + 1)
            features_arg.append(columnName)
            temp[columnName] = X_data_DF['tweet_text'].apply(lambda text: self.apply_regex_ngrams(text, regex))
        temp_sparse = scipy.sparse.csr_matrix(temp.values)
        return temp_sparse, features_arg
    
    def series_to_DataFrame(self, X_data):
        X_data = X_data.to_frame()
        return X_data
    
    def concat_sparse_matrices_h(self, data_X_dtm, data_Rules_dtm, features_X, features_Rules):
        combined_features = features_X + features_Rules
        concat_sparse = scipy.sparse.hstack([data_X_dtm, data_Rules_dtm], format='csr')
        return concat_sparse, combined_features
    
    def apply_regex_ngrams(self, text, regex):
        match_found = (re.search(regex, text) != None)
        match_found = int(match_found == True)
        return match_found
    
    def traditional_getFeatures(self, Vect):
        df = pd.read_csv(self.path_to_PreprocessedDataset, encoding = "ISO-8859-1")
        X_train_WoR_dtm = Vect.transform(df["tweet_text"])
        X_train_Features = list(Vect.get_feature_names_out())
        if self.include_rules:
            df_notPreprocessed = pd.read_csv(self.path_to_nonPreprocessedDataset,  encoding = "ISO-8859-1")
            X_train_dtm, X_train_Features = self.apply_gen_rules_features_ngrams(df_notPreprocessed["tweet_text"], \
                                                                      X_train_WoR_dtm, X_train_Features)
        else:
            X_train_dtm = X_train_WoR_dtm
        return X_train_dtm.toarray(), Vect

    def traditional_loadClf(self):  
        clf_model = pickle.load(open(self.pathToClfModel, 'rb'))
        return clf_model
    
    def get_predictions(self, clf_model, X):
        return clf_model.predict(X)
    
    def generateNameFromDataSetName(self, list1):
        str1 = ""
        for e in list1:
            if e == "":
                str1 += '.'.join(str(e))
            else:
                if e == "csv":
                    e = "_withPredictions." + e
                    str1 += ''.join(str(e))
                else:
                    str1 += ''.join(str(e))
        return ".." + str1        
    
    def evaluation(self):
        if "WRul" in self.pathToTokenizer:
            if not self.include_rules:
                print("Vectoirzer and appending does not match")
        if "WoRul" in self.pathToTokenizer:
            if self.include_rules:
                print("Vectoirzer and appending does not match")
        if "WRul" in self.pathToClfModel:
            if not self.include_rules:
                print("Vectoirzer and appending does not match")
        if "WoRul" in self.pathToClfModel:
            if self.include_rules:
                print("Vectoirzer and appending does not match")
        if not self.pathToClfModel.split("/")[-1].split("_")[5:-1] == self.pathToTokenizer.split("/")[-1].split("_"):
            print("Vectorizer and classifer does not match.")
            print("Vectorizer:", self.pathToTokenizer)
            print("Classifier:", self.pathToClfModel)
            print("Classifier:", self.pathToTokenizer.split("/")[-1].split("_"), "Vectorizer:" , self.pathToClfModel.split("/")[-1].split("_")[5:-1])

    def generatePredNGramsPretrained(self, path_to_nonPreprocessedDataset, path_to_PreprocessedDataset, pathToVectFolder,\
                                         pathToClfModel, path2word2vecModel, pathToWord2VecPretrained):

        NotProcessedDataWithPredictions = pd.read_csv(obj.path_to_nonPreprocessedDataset, encoding = "ISO-8859-1")
        for vect_filename in glob.glob(pathToVectFolder + "/*"):
            print(40*"+")
            obj.traditional_ChangeVectPath(vect_filename)
            Vect = obj.traditional_loadVect()
            for clfModel_filename in glob.glob(pathToClfFolder + "*"):
                if clfModel_filename.split("/")[-1].split("_")[5:-1] == vect_filename.split("/")[-1].split("_"):
                    obj.traditional_ChangeClfModelPath(clfModel_filename)
                    name_cell = obj.pathToClfModel.split("/")[-1].split("_")[5:]
                    if name_cell[-3] == "WRul":
                        obj.include_rules = True
                    else:
                        obj.include_rules = False
                    # print(obj.pathToTokenizer, obj.pathToClfModel)
                    # print(name_cell)
                    # print(name_cell[-5] + "_" + name_cell[-3]  + "_" + name_cell[-4] + "_" + name_cell[-2] + "_" + name_cell[-1])
                    X, _ = obj.traditional_getFeatures(Vect)
                    clf_model = obj.traditional_loadClf()
                    y_pred = obj.get_predictions(clf_model, X)
                    name_cell = name_cell[-5] + "_" + name_cell[-3]  + "_" + name_cell[-4] + "_" + name_cell[-2] + "_" + name_cell[-1]
                    print(name_cell)
                    NotProcessedDataWithPredictions[name_cell] = y_pred.tolist()
        notProcessedDatasetName = obj.generateNameFromDataSetName(obj.path_to_nonPreprocessedDataset.split("."))
        NotProcessedDataWithPredictions.to_csv(notProcessedDatasetName, index=False)
        print(40*"-")
        
    def word_vector_avg(self, inv_tokenizer, fun_word_to_vec_map, tw_sequence, size):
        vec = np.zeros(size)
        count = 0
        columnNameList = []
        for i in range(18):
            columnName = "rule" + str(i + 1)
            columnNameList.append(columnName)
        for seq in tw_sequence:
            try:
                if inv_tokenizer[seq] in columnNameList:
                    embedding_vector = np.ndarray(shape=(size,))
                    embedding_vector[:] = 1/size
                else:
                    embedding_vector = fun_word_to_vec_map[inv_tokenizer[seq]]
                if embedding_vector is not None:
                    vec += embedding_vector
                else:
                    vec += np.zeros(size)
                count += 1.
            except KeyError:  # handling the case where the token is not in vocabulary
                continue
        if count != 0:
            vec /= count
        return vec
   
    
    
    def apply_regex_word2vec(self, text, regex, ruleTokenizerSequenceIndex):
        match_found = (re.search(regex, text) != None)
        if match_found:
            match_found = int(ruleTokenizerSequenceIndex)
        else:
            match_found = 0
        return match_found
    
    def gen_rules_features_word2vec(self, X_data_series, tokenizer):  # , X_data_dtm, features_arg):
        '''sparse matrix and series matrices should be converted to dataframe for applying rules and treating
    it as features...
    I wrote two functions i.e.,sparse_matrix_to_DataFrame() and series_DataFrame()
      for changing datatypes'''
        X_data_DF = self.series_to_DataFrame(X_data_series)
        regexes = [
            re.compile(r'\b(I|we)\b.*\b(am|are|will be)\b.*\b(bringing|giving|helping|raising|donating|auctioning)\b',
                       re.I | re.M),
            re.compile(r'\b(I\'m|Im)\b.*\b(bringing|giving|helping|raising|donating|auctioning)\b', re.I | re.M),
            re.compile(r'\b(we\'re|we are)\b.*\b(bringing|giving|helping|raising|donating|auctioning)\b', re.I | re.M),
            re.compile(r'\b(I|we)\b.*\b(will|would like to)\b.*\b(bring|give|help|raise|donate|auction)\b',
                       re.I | re.M),
            re.compile(r'\b(I|we)\b.*\b(will|would like to)\b.*\b(work|volunteer|assist)\b', re.I | re.M),
            re.compile(r'\b(we\'ll|we will)\b.*\b(bring|give|help|raise|donate|auction)\b', re.I | re.M),
            re.compile(r'\b(I|we)\b.*\b(ready|prepared)\b.*\b(bring|give|help|raise|donate|auction)\b', re.I | re.M),
            re.compile(r'\b(where)\b.*\b(can I|can we)\b.*\b(bring|give|help|raise|donate)\b', re.I | re.M),
            re.compile(r'\b(where)\b.*\b(can I|can we)\b.*\b(work|volunteer|assist)\b', re.I | re.M),
            re.compile(r'\b(I|we)\b.*\b(like|want)\b.*\bto\b.*\b(bring|give|help|raise|donate)\b', re.I | re.M),
            re.compile(r'\b(I|we)\b.*\b(like|want)\b.*\bto\b.*\b(work|volunteer|assist)\b', re.I | re.M),
            re.compile(r'\b(will be)\b.*\b(brought|given|raised|donated|auctioned)\b', re.I | re.M),
            re.compile(r'\b\w*\s*\b\?', re.I | re.M),
            re.compile(r'\b(you|u).*(can|could|should|want to)\b', re.I | re.M),
            re.compile(r'\b(can|could|should).*(you|u)\b', re.I | re.M),
            re.compile(r'\b(like|want)\b.*\bto\b.*\b(bring|give|help|raise|donate)\b', re.I | re.M),
            re.compile(r'\b(how)\b.*\b(can I|can we)\b.*\b(bring|give|help|raise|donate)\b', re.I | re.M),
            re.compile(r'\b(how)\b.*\b(can I|can we)\b.*\b(work|volunteer|assist)\b', re.I | re.M)

        ]
        temp = pd.DataFrame()
        features_arg = []
        for i, regex in zip(range(len(regexes)), regexes):
            # we can also use ruleString()
            columnName = "rule" + str(i + 1)
            features_arg.append(columnName)
            temp[columnName] = X_data_DF['tweet_text'].apply(lambda text: self.apply_regex_word2vec(text, regex, tokenizer.word_index[columnName]))

        temp_sparse = scipy.sparse.csr_matrix(temp.values)

        return temp_sparse, features_arg
    
    def apply_gen_rules_features_word2vec(self, X, X_train_WoR_dtm, tokenizer, X_train_WoR_Features):
        X_Rules_dtm, features_Rules = self.gen_rules_features_word2vec(X, tokenizer)

        X_train_WR_dtm, combined_features = self.concat_sparse_matrices_h(X_train_WoR_dtm, X_Rules_dtm,
                                                                          X_train_WoR_Features, features_Rules)
        return X_train_WR_dtm, combined_feature
    

    def word2vec_ChangeWord2VecPretrainedPath(self, pathToWord2VecPretrained):
        self.pathToWord2VecPretrained = pathToWord2VecPretrained 

    def word2vec_loadWord2VecPretrained(self):  
        word2vec_model = pickle.load(open(self.pathToWord2VecPretrained, 'rb'))
        return word2vec_model
    


    def invTokenizer(self, tokenizer):
        inv_tokenizer = {v: k for k, v in tokenizer.word_index.items()}
        return inv_tokenizer

    def word2vec_Vect(self, pathToWord2VecPretrained, df):
        tokenizer = pickle.load(open(self.pathToTokenizer, 'rb'))
        embed_dim = int(self.pathToTokenizer.split("Freq-")[1])
        if "WRul" in self.pathToTokenizer:
            MAX_SEQUENCE_LENGTH = int(self.pathToTokenizer.split("Seq-")[1].split("_")[0]) - 18
        else:
            MAX_SEQUENCE_LENGTH = int(self.pathToTokenizer.split("Seq-")[1].split("_")[0])

        sequences = tokenizer.texts_to_sequences(df['tweet_text'])
        X_train_WoR_dtm = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
   
        X_train_Features = ["_Not_Apply_"]
        # print(type(X_train_WoR_dtm), X_train_WoR_dtm.shape)
        if self.include_rules:
            df_notPreprocessed = pd.read_csv(self.path_to_nonPreprocessedDataset)
            X_train_dtm, _ = self.apply_gen_rules_features(df_notPreprocessed["tweet_text"], X_train_WoR_dtm, tokenizer, \
                                                      X_train_Features)
        else:
            X_train_dtm = X_train_WoR_dtm
        inv_tokenizer = self.invTokenizer(tokenizer)
        fun_word_to_vec_map = self.load_word2vec_model(pathToWord2VecPretrained)
        vector = []
       
        for tw_sequence in X_train_dtm:
            vec = self.word_vector_avg(inv_tokenizer, fun_word_to_vec_map, tw_sequence, embed_dim)
            vector.append(vec)
        X_train_dtm = np.array(vector)
        return X_train_dtm
    
    def load_word2vec_model(self, path):
        if "Model" not in path:
            print("You may not loading a model; You may loading a tokenizer.")
        if "glove".lower() in path.lower().split("/")[-1]:
            word_to_vec_map = pickle.load(open(path, 'rb'))
        elif "word2vec".lower() in path.lower().split("/")[-1]:
            word_to_vec_map = pickle.load(open(path, 'rb'))
        elif "google".lower() in path.lower().split("/")[-1]:
            word_to_vec_map = pickle.load(open(path, 'rb'))
        elif "doc2vec".lower() in path.lower().split("/")[-1]:
            word_to_vec_map = pickle.load(open(path, 'rb'))
            
        elif "crisisNLP2vec".lower() in path.lower().split("/")[2]:
             word_to_vec_map = pickle.load(open(path, 'rb'))
        return word_to_vec_map
    
    def get_predictions(self, clf_model, X):
        return clf_model.predict(X)
    
    def generateNameFromDataSetName(self, list1):
        str1 = ""
        for e in list1:
            if e == "":
                str1 += '.'.join(str(e))
            else:
                if e == "csv":
                    e = "_withPredictions." + e
                    str1 += ''.join(str(e))
                else:
                    str1 += ''.join(str(e))
        return ".." + str1

    def generatePredWord2VecPretrained(self, path_to_nonPreprocessedDataset, path_to_PreprocessedDataset, pathToVectFolder,\
                                     pathToClfModel, path2word2vecModel, pathToWord2VecPretrained):
        NotProcessedDataWithPredictions = pd.read_csv(obj.path_to_nonPreprocessedDataset, encoding = "ISO-8859-1")
        df = pd.read_csv(obj.path_to_PreprocessedDataset,  encoding = "ISO-8859-1")
        for vect_filename in glob.glob(pathToVectFolder + "/*"):
            print(40*"+")
            obj.traditional_ChangeVectPath(vect_filename)
            Vect = obj.traditional_loadVect()
            for pretrainedWord2VecModel_filename in glob.glob(pathToWord2VecPretrained + "*"): 
                if pretrainedWord2VecModel_filename.split("/")[-1].replace("Model", "Vect").split("_") == \
                vect_filename.split("/")[-1].split("_"):
                    obj.word2vec_ChangeWord2VecPretrainedPath(pretrainedWord2VecModel_filename)
                    for clfModel_filename in glob.glob(pathToClfFolder + "*"):
                        if clfModel_filename.split("/")[-1].split("_")[6:-1] == pretrainedWord2VecModel_filename.split("/")[-1].replace("Model", "Vect").split("_"):
                            obj.traditional_ChangeClfModelPath(clfModel_filename)
                            name_cell = obj.pathToClfModel.split("/")[-1].split("_")[5:]
                            if name_cell[-3] == "WRul":
                                obj.include_rules = True
                            else:
                                obj.include_rules = False
                            X = obj.word2vec_Vect(pretrainedWord2VecModel_filename, df)
                            clf_model = obj.traditional_loadClf()
                            y_pred = obj.get_predictions(clf_model, X)
                            name_cell = name_cell[-6] + "_" + name_cell[-3]  + "_" + name_cell[-4] + "_" + name_cell[-2] + "_" + name_cell[-1]
                            print(name_cell)
                            NotProcessedDataWithPredictions[name_cell] = y_pred.tolist()
        notProcessedDatasetName = obj.generateNameFromDataSetName(obj.path_to_nonPreprocessedDataset.split("."))
        NotProcessedDataWithPredictions.to_csv(notProcessedDatasetName, index=False)
        print(40*"-")



In [155]:
pathToClfModel = "" 
path2word2vecModel = 'dummyPath/word2vecModels/word2vec'
pathToWord2VecPretrained = "../word2vec_preTrainedModel/"

path_to_nonPreprocessedDataset = "../datasets/500_random_sample.csv"
path_to_PreprocessedDataset = "../datasets/500_random_sample_processed.csv"

pathToVectFolder = "../simple_vect/"
pathToClfFolder = "../simple_clfModels/"
obj = UsePretrainedVectClf(pathToVectFolder, pathToClfModel, pathToWord2VecPretrained, path_to_PreprocessedDataset, \
                   path_to_nonPreprocessedDataset)
obj.generatePredNGramsPretrained(path_to_nonPreprocessedDataset, path_to_PreprocessedDataset, pathToVectFolder,\
                                 pathToClfModel, path2word2vecModel, pathToWord2VecPretrained)

path_to_nonPreprocessedDataset = "../datasets/500_random_sample_withPredictions.csv"
path_to_PreprocessedDataset = "../datasets/500_random_sample_processed.csv"

pathToVectFolder = "../tfidf_vect/"
pathToClfFolder = "../tfidf_clfModels/"
obj = UsePretrainedVectClf(pathToVectFolder, pathToClfModel, pathToWord2VecPretrained, path_to_PreprocessedDataset, \
                   path_to_nonPreprocessedDataset)
obj.generatePredNGramsPretrained(path_to_nonPreprocessedDataset, path_to_PreprocessedDataset, pathToVectFolder,\
                                 pathToClfModel, path2word2vecModel, pathToWord2VecPretrained)

path_to_nonPreprocessedDataset = "../datasets/500_random_sample_withPredictions_withPredictions.csv"
path_to_PreprocessedDataset = "../datasets/500_random_sample_processed.csv"

pathToVectFolder = "../word2vec_vect/"
pathToClfFolder = "../word2vec_clfModels/"
obj = UsePretrainedVectClf(pathToVectFolder, pathToClfModel, pathToWord2VecPretrained, path_to_PreprocessedDataset, \
                   path_to_nonPreprocessedDataset)
obj.generatePredWord2VecPretrained(path_to_nonPreprocessedDataset, path_to_PreprocessedDataset, pathToVectFolder,\
                                 pathToClfModel, path2word2vecModel, pathToWord2VecPretrained)

++++++++++++++++++++++++++++++++++++++++
Bigrams_WRul_CountVect_Freq-2959_GB
Bigrams_WRul_CountVect_Freq-2959_LR
Bigrams_WRul_CountVect_Freq-2959_MLP
Bigrams_WRul_CountVect_Freq-2959_NB
Bigrams_WRul_CountVect_Freq-2959_RF
Bigrams_WRul_CountVect_Freq-2959_SVM
Bigrams_WRul_CountVect_Freq-2959_DT
++++++++++++++++++++++++++++++++++++++++
BiTrigrams_WRul_CountVect_Freq-6502_DT
BiTrigrams_WRul_CountVect_Freq-6502_GB
BiTrigrams_WRul_CountVect_Freq-6502_LR
BiTrigrams_WRul_CountVect_Freq-6502_MLP
BiTrigrams_WRul_CountVect_Freq-6502_NB
BiTrigrams_WRul_CountVect_Freq-6502_RF
BiTrigrams_WRul_CountVect_Freq-6502_SVM
++++++++++++++++++++++++++++++++++++++++
Trigrams_WRul_CountVect_Freq-3561_DT
Trigrams_WRul_CountVect_Freq-3561_GB
Trigrams_WRul_CountVect_Freq-3561_LR
Trigrams_WRul_CountVect_Freq-3561_MLP
Trigrams_WRul_CountVect_Freq-3561_NB
Trigrams_WRul_CountVect_Freq-3561_RF
Trigrams_WRul_CountVect_Freq-3561_SVM
++++++++++++++++++++++++++++++++++++++++
UniAndBigrams_WRul_CountVect_Freq-4215_GB
UniA

In [None]:
df.iloc[:, 4:].shape

(500, 238)

1                  130
0                  108
2354                 1
2.66e+17             1
sandy hurricane      1
request              1
Name: 1, dtype: int64

In [None]:
def get_max_from_rows_into_column(df, indexes):
    """ The function takes dataframe and index of columns, and 
    then it returns the array containg the most frequent value against each row.
    Example: 
    dataframe:
    'c1' | 'c2'| 'c3' | 'c4'
    -------------------------
      1  |  1  |   1  |  0  
      2  |  1  |   2  |  2
      2  |  3  |   3  |  0 
      2  |  0  |   0  |  0 
      returned array:
      (1, 2, 3, 0)
      df: dataframe
      indexes: indexes of columns
      example: get_max_from_rows_into_column(dataframe, -3:)
      """
    def get_array(df):
        max_unique_label_column = np.empty([0])
        for v in df.values:
            unique_label, freq_label = np.unique(v, return_counts=True)
            # print(unique_label[np.argmax(freq_label)])
            max_unique_label_column = np.append(max_unique_label_column, unique_label[np.argmax(freq_label)])
            
        return max_unique_label_column
    if ":" in indexes:
        if indexes[0] == ":":
            
            df = df.iloc[:, :int(indexes[1:])]
            max_unique_label_column = get_array(df)
        if indexes[-1] == ":":
            df = df.iloc[:, int(indexes[:-1]):]
            max_unique_label_column = get_array(df)
           
        else:
            indexes = [int(x) for x in indexes.split(":")]
            df = df.iloc[:, indexes[0]: indexes[1]]
            max_unique_label_column = get_array(df)
    else:
            indexes = [int(x) for x in indexes.split(",")]
            df = df.iloc[:, indexes]
            max_unique_label_column = get_array(df)
    return max_unique_label_column.astype(int)
path = "500_random_sample_withPredictions_withPredictions_withPredictions.csv"
df = pd.read_csv(path,  encoding = "ISO-8859-1")

indexes = "4:"
df["tweet_class"] = get_max_from_rows_into_column(df, indexes)
df = df[["tweetID", "tweet_text", "tweet_class"]]

In [None]:
path = "500_random_sample_withPredictions_withPredictions_withPredictions.csv"
dff = pd.read_csv(path,  encoding = "ISO-8859-1")
i = 01
c = dff.iloc[i, :]

if c.value_counts()[0] > c.value_counts()[1]:
    print(0, ":", c.value_counts()[0])ounts()[0]:
    print(1, ":", c.value_counts()[1])
    rint(df["tweet_class"][i] == 1)
df["tweet_class"][0] 

0 : 122
True
1 : 130
True
1 : 192
True
0 : 125
True
0 : 181
True
1 : 221
True
1 : 225
True
1 : 167
True
0 : 149
True
1 : 179
True
1 : 221
True
0 : 196
True
1 : 215
True
0 : 207
True
1 : 193
True
1 : 199
True
0 : 153
True
0 : 204
True
0 : 166
True
1 : 215
True
1 : 133
True
1 : 223
True
1 : 203
True
1 : 136
True
1 : 204
True
0 : 198
True
1 : 229
True
1 : 199
True
1 : 168
True
1 : 217
True
0 : 207
True
1 : 197
True
0 : 142
True
1 : 214
True
1 : 199
True
1 : 176
True
0 : 206
True
1 : 124
True
1 : 224
True
1 : 193
True
1 : 199
True
1 : 216
True
1 : 198
True
1 : 217
True
1 : 122
True
1 : 196
True
0 : 168
True
0 : 149
True
0 : 170
True
0 : 191
True
0 : 178
True
1 : 120
True
1 : 176
True
0 : 150
True
1 : 231
True
1 : 203
True
1 : 229
True
1 : 233
True
1 : 225
True
1 : 179
True
0 : 131
True
0 : 154
True
1 : 190
True
0 : 171
True
0 : 193
True
0 : 194
True
1 : 160
True
0 : 174
True
1 : 207
True
0 : 206
True
0 : 204
True
1 : 221
True
0 : 144
True
1 : 232
True
0 : 158
True
1 : 209
True
1 : 213
True

(0, 122)

In [None]:
[22-05-06_22-56-41_Vect_WoRul_Max-Len-Seq-26_Freq-300_22-05-06_22-56-41_word2vec_Vect_WoRul_Max-Len-Seq-26_Freq-300]
==
['22-05-06', '22-56-41', 'word2vec', 'Model', 'WoRul', 'Max-Len-Seq-26', 'Freq-300']