In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import string
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from skmultilearn.problem_transform import ClassifierChain
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
%matplotlib inline
from sklearn import tree
import matplotlib
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from keras.models import Sequential
from keras import layers
# Corpora used by preprocess(): the English stop-word list and the WordNet
# lexicon that WordNetLemmatizer reads. Without 'wordnet', lemmatize() raises
# LookupError on first use.
# NOTE(review): some NLTK releases also need 'omw-1.4' for the lemmatizer —
# confirm against the installed version.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Raw dataset; later cells read its "ID", "Text" and "Class" columns.
dfref = pd.read_csv(filepath_or_buffer=r'../input/new-data/FR-Dataset.csv')

In [None]:

# Keep only rows that have a text, and only the first row for each distinct text.
dfref = dfref.dropna(subset=["Text"]).drop_duplicates(subset=['Text'])
dfref.describe()

In [None]:
def createArrayOfLabels(df):
    """
    One-hot encode a collection of comma-separated label strings.

    args
    df: iterable (e.g. pandas.Series) whose entries are strings holding zero or
        more labels separated by commas; spaces inside an entry are ignored

    return
    pandas.DataFrame with one row per entry (fresh 0..n-1 index) and one 0/1
    column per distinct label, columns in the order of mlb.classes_
    """
    label_lists = []
    for entry in df:
        if isinstance(entry, str):
            # Remove every space first so "A, B" and "A,B" give the same
            # labels; empty fragments (",," or a trailing comma) are skipped.
            parts = entry.replace(' ', '').split(',')
            label_lists.append([part.strip() for part in parts if part != ''])
        else:
            # Missing / non-text cells (e.g. NaN) carry no labels.
            label_lists.append([])

    # Collect all rows first and encode once: DataFrame.append (removed in
    # pandas 2.0) copied the whole frame on every iteration.
    mlb = MultiLabelBinarizer()
    encoded = mlb.fit_transform(label_lists)
    return pd.DataFrame(encoded, columns=list(mlb.classes_))




In [None]:
# Filled on first use so the stop-word file is read and the lemmatizer is
# built once, not once per document.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return (stop-word set, lemmatizer), creating them on the first call."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess(x):
    """
    Normalise one document for vectorisation.

    The text is lower-cased, ASCII punctuation is removed, English stop words
    are dropped and the remaining words are lemmatized.

    args
    x: str, the raw text

    return
    str, the remaining lemmas joined by single spaces; the literal 'empty'
    (after printing a message) when x is not a string, e.g. NaN

    Errors raised by NLTK itself (such as a missing corpus) are allowed to
    propagate instead of silently turning every document into 'empty'.
    """
    if not isinstance(x, str):
        print(f'There is an error in {x}')
        return 'empty'

    stop_words, lemmatizer = _get_preprocess_resources()
    without_punctuation = x.lower().translate(str.maketrans('', '', string.punctuation))
    lemmas = [
        lemmatizer.lemmatize(word)
        for word in without_punctuation.split()
        if word not in stop_words
    ]
    # Join the tokens directly; slicing str(list) left double spaces and
    # repr-escaped any non-printable character.
    return ' '.join(lemmas)
    
    

In [None]:
# createArrayOfLabels returns a frame with a fresh 0..n-1 index, so dfref is
# given the same index before the two are placed side by side.
dfref = dfref.reset_index(drop=True)
Class = createArrayOfLabels(dfref['Class'])
tocsv = pd.concat(objs=[dfref, Class], axis='columns')

In [None]:

# Guard against rows without text and rebuild the index, so the positional
# column-wise concat with the TF-IDF matrix in a later cell stays aligned even
# if a row is dropped here.
tocsv = tocsv.dropna(subset=["Text"]).reset_index(drop=True)
tocsv = tocsv.drop(columns=['ID', 'Class'])
tocsv['Text'] = tocsv['Text'].apply(preprocess)
# Written to the working directory. The previous 'C:Fr2.csv' was a Windows
# drive-relative path, while the input is read from a POSIX-style path.
tocsv.to_csv('Fr2.csv')

In [None]:
# TF-IDF features over the cleaned text, capped at 2500 vocabulary terms.
# NOTE(review): the vectorizer is fitted on every row, including rows that the
# later train_test_split assigns to the test set, so vocabulary/IDF statistics
# see test data. Consider fitting on the training rows only.
v = TfidfVectorizer(max_features=2500)
x = v.fit_transform(tocsv.loc[:, 'Text'])

In [None]:
# Dense TF-IDF columns (integer column names) go in front of the existing
# columns; the two frames are matched on their index.
# Re-running this cell prepends the TF-IDF columns again.
tocsv = pd.concat(objs=[pd.DataFrame(data=x.toarray()), tocsv], axis='columns')
print(tocsv.shape)

In [None]:
columns = list(tocsv.columns)

# Take the label columns from the one-hot frame that produced them instead of
# assuming that exactly the last four columns are labels.
label_columns = list(Class.columns)

# Features are everything except the labels and the raw text.
features = tocsv.drop(columns=label_columns + ['Text'])
X_train, X_test, y_train, y_test = train_test_split(
    features, tocsv[label_columns], test_size=0.25, random_state=42
)

In [None]:
# ClassifierChain around a class-weighted random forest. The forest is seeded
# (same seed as the train/test split) so repeated runs report the same scores.
classifier = ClassifierChain(RandomForestClassifier(class_weight='balanced', random_state=42))
classifier.fit(X_train, y_train)
print(X_test.shape)
predictions = classifier.predict(X_test)
print("Accuracy = ",accuracy_score(y_test,predictions))
print(classification_report(y_test,predictions))


In [None]:
classifier = ClassifierChain(MultinomialNB())
classifier.fit(X_train, y_train)
# Report rows follow the column order of y_test, so the display names are taken
# from those columns. A hand-written list in a different order would attach
# the scores to the wrong labels.
target_names = [str(label) for label in y_test.columns]
predictions = classifier.predict(X_test)
print(classification_report(y_test,predictions,target_names=target_names))


In [None]:
# A single class-weighted random forest fitted on all label columns at once.
# Seeded (same seed as the train/test split) so repeated runs report the same
# scores.
classifier = RandomForestClassifier(class_weight='balanced', random_state=42)
classifier.fit(X_train, y_train)
print(X_test.shape)
predictions = classifier.predict(X_test)
print("Accuracy = ",accuracy_score(y_test,predictions))
print(classification_report(y_test,predictions))


In [None]:
import numpy as np
import pandas as pd
import random
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors



def get_tail_label(df):
    """
    Give tail label columns of the given target dataframe

    For every label that occurs at least once, the imbalance ratio is
    (largest positive count) / (positive count of that label). A label is a
    tail label when its ratio is above the mean ratio.

    Labels with no positive sample are left out: they have no instance that
    could be oversampled, and previously made value_counts()[1] raise KeyError.

    args
    df: pandas.DataFrame, target label df whose tail label has to identified

    return
    tail_label: list, a list containing column name of all the tail label
                (empty when df has no columns or no positive sample)
    """
    if df.shape[1] == 0:
        return []

    # Positives per label, counted directly instead of through value_counts().
    positives = (df == 1).sum(axis=0).to_numpy(dtype=float)
    present = positives > 0
    if not present.any():
        return []

    irpl = positives[present].max() / positives[present]
    mir = irpl.mean()
    return [label for label, ratio in zip(df.columns[present], irpl) if ratio > mir]

def get_index(df):
    """
    Collect the index labels of every row that carries at least one tail label

    args
    df: pandas.DataFrame, target label df; its tail labels are determined with
        get_tail_label

    return
    list, the distinct index labels of the rows where some tail label equals 1
          (built from a set, so the order is not meaningful)
    """
    tail_rows = set()
    for label in get_tail_label(df):
        has_label = df[label] == 1
        tail_rows = tail_rows.union(set(df[has_label].index))
    return list(tail_rows)

def get_minority_instace(X, y):
    """
    Select the samples that carry at least one tail label

    args
    X: pandas.DataFrame, the feature vector dataframe
    y: pandas.DataFrame, the target vector dataframe; tail rows are looked up
       in y and then matched against the index of X and of y

    return
    X_sub: pandas.DataFrame, feature rows of the minority samples, index reset
    y_sub: pandas.DataFrame, target rows of the minority samples, index reset
    """
    tail_rows = get_index(y)
    keep_X = X.index.isin(tail_rows)
    keep_y = y.index.isin(tail_rows)
    X_sub = X.loc[keep_X].reset_index(drop=True)
    y_sub = y.loc[keep_y].reset_index(drop=True)
    return X_sub, y_sub

def nearest_neighbour(X, n_neighbors=5):
    """
    Give index of the nearest neighbours of all the instance

    The model is fitted on X and queried with X itself, so each row's own
    position is normally the first entry of its neighbour list.

    args
    X: np.array (or DataFrame), array whose nearest neighbor has to find
    n_neighbors: int, number of neighbours returned per row (default 5, the
                 value that was previously hard-coded)

    return
    indices: array of shape (len(X), n_neighbors), positional indices of the
             nearest neighbours of each element in X
    """
    nbs = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean', algorithm='kd_tree').fit(X)
    _, indices = nbs.kneighbors(X)
    return indices

def MLSMOTE(X,y, n_sample, random_state=None):
    """
    Give the augmented data using MLSMOTE algorithm

    Each synthetic sample is built from a randomly chosen reference row and
    one of its nearest neighbours: the features are a random point on the
    segment between the two, and a label is set when more than half of the
    reference's neighbourhood (reference included) has it.

    args
    X: pandas.DataFrame, feature vector DataFrame (numeric columns)
    y: pandas.DataFrame, target vector dataframe, row-aligned with X
    n_sample: int, number of newly generated sample
    random_state: optional seed; when given, a private random.Random(seed)
                  drives the sampling, otherwise the global `random` module
                  state is used as before

    return
    new_X: pandas.DataFrame, X followed by the synthetic feature rows
    target: pandas.DataFrame, y followed by the synthetic target rows
    (the synthetic rows are indexed 0..n_sample-1, so index labels may repeat)
    """
    rng = random.Random(random_state) if random_state is not None else random
    indices2 = nearest_neighbour(X)
    n = len(indices2)
    # kneighbors returns row positions, so work on positional arrays instead
    # of label-based .loc / index.isin, which only matched for a 0..n-1 index.
    X_values = X.to_numpy(dtype=float)
    y_values = y.to_numpy()
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = rng.randint(0,n-1)
        # Column 0 is skipped: it is normally the reference row itself.
        neighbour = rng.choice(indices2[reference,1:])
        all_point = indices2[reference]
        votes = y_values[all_point].sum(axis=0)
        # Majority vote over the neighbourhood; with 5 neighbours this is the
        # former "sum > 2" rule.
        target[i] = votes > len(all_point) / 2
        ratio = rng.random()
        # Interpolate from the reference towards the neighbour. The previous
        # (reference - neighbour) gap pushed samples away from the neighbour.
        gap = X_values[neighbour] - X_values[reference]
        new_X[i] = X_values[reference] + ratio * gap
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    new_X = pd.concat([X, new_X], axis=0)
    target = pd.concat([y, target], axis=0)
    return new_X, target


In [None]:
    # Minority instances (rows carrying a tail label) of the training frames.
    X_sub, y_sub = get_minority_instace(X_train, y_train)
    # Augment them with 1000 synthetic samples.
    # NOTE(review): MLSMOTE returns its inputs followed by the synthetic rows,
    # so X_res / y_res hold only the minority subset plus the synthetic rows,
    # not the full training set — confirm this is what the next cell should
    # train on.
    X_res, y_res = MLSMOTE(X_sub, y_sub, n_sample=1000)
    print(y_res.sum(), y_train.sum())

In [None]:
# Same ClassifierChain / random-forest setup, fitted on the MLSMOTE output.
# The forest is seeded (same seed as the train/test split) so repeated runs
# report the same scores.
classifier = ClassifierChain(RandomForestClassifier(class_weight='balanced', random_state=42))
classifier.fit(X_res, y_res)

predictions = classifier.predict(X_test)
print(classification_report(y_test,predictions))
