<a href="https://colab.research.google.com/github/sksoumik/programming_notes/blob/main/Code_Snippets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Basic library imports for colab




In [None]:
# Mount Google Drive so files under /content/drive are reachable from the notebook.
from google.colab import drive
drive.mount('/content/drive')

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import torch

# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

# Force output to display the full text of each cell instead of truncating it.
# `None` means "no limit"; the old sentinel -1 is deprecated and rejected by newer pandas.
pd.set_option('display.max_colwidth', None)


# Optional GPU sanity check, kept disabled:
# device_name = tf.test.gpu_device_name()
# if device_name != '/device:GPU:0':
#   raise SystemError('GPU device not found')

# print('Found GPU at: {}'.format(device_name))
# print(torch.cuda.get_device_name(0))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Filter dataframe rows by text length (word count)

In [None]:
# Keep only the rows whose 'text' value has more than 10 whitespace-separated tokens.
df = df.loc[df['text'].str.split().str.len().gt(10)]

## Text data cleaning methods, nlp preprocessing


In [None]:
import emoji
import re
import html
import unicodedata
import unidecode


def clean(text):
    """Run the full cleaning pipeline over one string.

    The steps are applied in a fixed order: text standardisation, URL removal,
    e-mail / mention removal, then emoji processing. The result has every run
    of whitespace collapsed to a single space and no leading/trailing blanks.
    """
    pipeline = (
        standardize_text,
        replace_url,
        replace_mail_or_mentions,
        process_emojis,
    )
    for step in pipeline:
        text = step(text)
    return " ".join(text.split())


def standardize_text(text):
    """Normalise raw text before further cleaning.

    Steps, in order:
    1) Unescape HTML entities (e.g. ``&amp;`` becomes ``&``).
    2) Map curly quotes, the acute accent and the en dash to their ASCII
       counterparts, and the ellipsis character to three dots.
    3) Turn runs of carriage returns, newlines and tabs into a single space.
    4) Drop every character whose Unicode category starts with ``C``.
    5) Collapse whitespace runs to single spaces and trim both ends.
    """
    unescaped = html.unescape(text)

    # translation table for non-standard punctuation
    punctuation_map = str.maketrans("‘’´“”–-", "'''\"\"--")
    normalised = unescaped.translate(punctuation_map).replace('…', '...')

    # line breaks and tabs become plain spaces before control characters are removed
    normalised = re.sub(r'[\r\n\t]+', ' ', normalised)

    kept = [ch for ch in normalised if not unicodedata.category(ch).startswith('C')]

    # split()/join collapses whitespace and leaves nothing to strip at the ends
    return ' '.join(''.join(kept).split())


def standardize_punctuation(text):
    """Transliterate punctuation characters with ``unidecode``.

    Only characters whose Unicode category starts with ``P`` are passed to
    ``unidecode.unidecode``; every other character is copied through unchanged.
    """
    pieces = []
    for char in text:
        if unicodedata.category(char).startswith('P'):
            char = unidecode.unidecode(char)
        pieces.append(char)
    return ''.join(pieces)


def replace_url(text, replace_token=""):
    """Replace URLs in ``text`` with ``replace_token``.

    A URL is either a token starting with ``www.`` or one starting with
    ``http://`` / ``https://``; it extends up to the next whitespace.

    Args:
        text: input string.
        replace_token: string substituted for each URL (default: remove it).

    Returns:
        The text with every matched URL replaced.
    """
    # The dot after "www" is escaped so it only matches a literal ".";
    # unescaped it matched any character and swallowed words such as "awwward".
    url_regex = re.compile(r'((www\.\S+)|(https?://\S+))')
    return url_regex.sub(replace_token, text)


def process_emojis(text, replace_token="", remove=False):
    """Either strip emojis from ``text`` or turn them into text aliases.

    Args:
        text: input string.
        replace_token: string substituted for each emoji character when
            ``remove`` is true.
        remove: when true, emoji characters are replaced by ``replace_token``;
            otherwise emojis are converted to ``:alias:`` form with the
            underscores inside the alias removed.

    Returns:
        The processed string.
    """
    def emoji_table():
        """Return a container keyed by emoji characters."""
        # NOTE(review): emoji>=2.0 exposes the table as EMOJI_DATA and no longer
        # ships UNICODE_EMOJI; some 1.x releases nest UNICODE_EMOJI by language
        # code. Confirm against the installed emoji version.
        table = getattr(emoji, "EMOJI_DATA", None)
        if table is None:
            table = emoji.UNICODE_EMOJI
            if isinstance(table, dict) and isinstance(table.get("en"), dict):
                table = table["en"]
        return table

    def remove_emojis(text, replace_token=""):
        known = emoji_table()
        return "".join(
            c if c not in known else replace_token for c in text)

    def asciify_emojis(text):
        """Convert emojis into text aliases.
        E.g. 👍 -> :thumbs_up: -> :thumbsup:
        """
        text = emoji.demojize(text)
        # Rewrite only the matched ":alias:" spans; a global str.replace would
        # also strip underscores from identical text elsewhere in the string.
        return re.sub(
            r'\:(.*?)\:',
            lambda match: ':' + match.group(1).replace('_', '') + ':',
            text)

    if remove:
        # forward the caller's token instead of always removing silently
        return remove_emojis(text, replace_token=replace_token)
    return asciify_emojis(text)


def replace_mail_or_mentions(text, replace_token=""):
    """Replace every whitespace-delimited token containing ``@``.

    The match covers the whole token around the ``@`` plus at most one
    following whitespace character, and is replaced by ``replace_token``.
    """
    at_token = re.compile(r'\S*@\S*\s?')
    return at_token.sub(replace_token, text)

##########################################################################
######## Other options that might be helpful to preprocess ###############
##########################################################################

# remove all rows that contain any non-ascii characters
# The filtered frame is assigned back to `df` itself; assigning it to the
# 'text' column does not drop any rows.
df = df[~df['text'].str.contains(r'[^\x00-\x7F]', na=False)]

# remove nan and duplicates
# keep=False discards every copy of a duplicated row (no representative is kept).
# The call is chained instead of run in place so it never mutates a slice.
df = df[df['text'].notnull()].drop_duplicates(keep=False)

# remove URLs
def remove_url(row):
    """Delete every token that starts with ``http`` (up to the next whitespace)."""
    return re.compile(r'http\S+').sub('', row)


# remove contents that are inside <*>

def remove_contents_in_brace(row):
    """Remove every ``<...>`` span, including the angle brackets.

    Args:
        row: input string.

    Returns:
        The string with each bracketed span deleted; text between two
        separate spans is kept.
    """
    # Non-greedy match: the greedy form ran from the first "<" to the last ">"
    # on a line and deleted everything in between.
    return re.sub(r'<.*?>', '', row)


# remove double spaces 
def remove_double_space(sentence):
    """Collapse every run of whitespace to one space and trim both ends."""
    words = sentence.split()
    return " ".join(words)
  

  # remove `, ` from the beginning of a sentence. 
def remove_leading_punc(row):
    """Drop a leading ``", "`` (comma + space) if the string starts with it."""
    if row.startswith(', '):
        return row[2:]
    return row

  
# remove all special characters - punctuation
# regex=True is passed explicitly: newer pandas treats the pattern as a literal
# string by default, which would leave the text untouched.
df['text'] = df['text'].str.replace(r'[^\w\s]+', '', regex=True)
  
  
# remove digits
import string

# str.rstrip only trims digits at the very end of each value; a character-class
# substitution removes them wherever they occur in the text.
df['text'] = df['text'].str.replace(f'[{string.digits}]+', '', regex=True)


# remove words that contain number and character both: IDs
# for example: U017Q2N13J 
def remove_numbers(words):
    """Delete every word that contains at least one digit, then trim the ends.

    A "word" is a run of ``\\w`` characters; only the outer whitespace of the
    result is stripped, inner gaps left by removed words are kept.
    """
    digit_word = re.compile(r'\w*\d\w*')
    without_ids = digit_word.sub('', words)
    return without_ids.strip()


## Remove null and duplicates from pandas

In [None]:
# remove nan and duplicates
# keep=False discards every copy of a duplicated row (no representative is kept).
# The call is chained instead of run in place so it never mutates a slice.
df = df[df['text'].notnull()].drop_duplicates(keep=False)

## Average sentence length in a column of dataframe

In [None]:
# Number of whitespace-separated tokens in each value of the Text column.
sentence_lengths = df.Text.str.split().str.len()

print('average sentence length: ', sentence_lengths.mean())
print('stdev sentence length: ', sentence_lengths.std())

## Display all images in a python List

In [None]:
import cv2 as cv 
import glob
import matplotlib.pyplot as plt 


path = "static/subfolder/*/*.jpg"

my_image_list = []

for image_path in glob.glob(path):
    bgr_image = cv.imread(image_path)  # OpenCV loads images as BGR
    if bgr_image is None:
        # imread returns None instead of raising when a file cannot be decoded
        continue
    # convert BGR to RGB so matplotlib shows the right colours
    my_image_list.append(cv.cvtColor(bgr_image, cv.COLOR_BGR2RGB))

# display all images
plt.figure(figsize=(20, 10))

columns = 4
# subplot() needs an integer row count; true division would pass a float
rows = len(my_image_list) // columns + 1

for i, image in enumerate(my_image_list):
    plt.subplot(rows, columns, i + 1)
    plt.imshow(image)

## Plot number of values distribution in a column of dataframe

In [None]:
# Bar chart of how often each distinct value occurs in the column.
# The trailing semicolon suppresses the Axes repr in the notebook output.
data['target_column'].value_counts().plot(kind='bar');

## Balancing class distribution of a dataset: Undersampling

In [None]:
def sampling_k_elements(group, k=3):
    """Return at most ``k`` randomly sampled rows from ``group``.

    Args:
        group: object supporting ``len()`` and ``.sample(k)``.
        k: maximum number of rows to keep.

    Returns:
        ``group`` unchanged when it has fewer than ``k`` rows, otherwise the
        result of ``group.sample(k)``.
    """
    if len(group) < k:
        return group
    return group.sample(k)

# Sample every class separately, then flatten the per-class results back into
# one frame with a fresh 0..n-1 index.
per_class = df.groupby('class')
balanced = per_class.apply(sampling_k_elements).reset_index(drop=True)

## Save the print output in a txt file

In [None]:
import sys

# put your output data container
print(report)

original_stdout = sys.stdout

with open("classification_report.txt", "w") as f:
    sys.stdout = f
    try:
        # put your output data container again
        print(report)
    finally:
        # always hand stdout back, even if printing fails, so later cells
        # do not keep writing into the (closed) file
        sys.stdout = original_stdout

## Mapping one list to another: adding two lists together

In [None]:
"""
categories: List[str]
category_ids: List[int]
"""

# Pair every category name with its id as "name:id"; pairing stops at the shorter list.
label_details = [
    name + ':' + str(identifier)
    for name, identifier in zip(categories, category_ids)
]

## Balancing class distribution in train test split

In [None]:
from sklearn.model_selection import train_test_split
# Stratify on the class column so both splits keep the original class proportions.
# random_state pins the shuffle so the split is reproducible between runs.
train_df, test_df = train_test_split(data,
                                    stratify=data['class_id'],
                                    test_size=0.20,
                                    random_state=42)

## Evaluation report generation with true label and predicted labels

In [None]:
from tqdm import tqdm

# read evaluation data
evaluation_df = pd.read_csv("path/to/evaluation/file.csv")

true_label = list(evaluation_df['class_id'])
evaluation_texts = list(evaluation_df["Text"])


# evaluation scores using ml model
prediction_probs = []

for text in tqdm(evaluation_texts):
    # Score one text per iteration. Passing the whole list on every iteration
    # re-ran the full prediction once per text and stored the complete result each time.
    # assumes model.predict takes a list of texts and returns
    # (predictions, extra) with one prediction per input — TODO confirm
    preds, _ = model.predict([text])
    prediction_probs.append(preds[0])

# make a new dataframe using the target texts and prediction probabilities
prediction_df = pd.DataFrame(
    {
        "Comment": evaluation_texts,
        'true labels': true_label,
        'Prediction' : prediction_probs,
    }
)

# save the prediction df
prediction_df.to_csv("save/path/filename.csv", index=False)

## scikit learn confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix


# Pull ground-truth and predicted labels out of the evaluation frame as plain lists.
true_label = [*evaluation_df['true labels']]
predicted_labels = [*evaluation_df['predicted labels']]

# First argument is the ground truth (y_test), second the predictions (y_pred).
matrix = confusion_matrix(true_label, predicted_labels)
matrix

## Plot confusion matrix

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


"""
Below code has been adapted from: https://github.com/DTrimarchi10/confusion_matrix
"""
def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Args:
        cf: 2-D confusion matrix (array or nested list).
        group_names: optional labels, one per cell in row order; ignored
            unless their count equals the number of cells.
        categories: tick labels passed to the heatmap for both axes.
        count: show the raw count in each cell.
        percent: show each cell's share of the total in each cell.
        cbar: draw the colour bar.
        xyticks: show tick labels; when falsy, ``categories`` is ignored.
        xyplotlabels: label the axes 'True label' / 'Predicted label'.
        sum_stats: append accuracy (plus precision, recall and F1 for a
            two-row matrix) below the x-axis label.
        figsize: figure size; defaults to matplotlib's configured size.
        cmap: colour map name.
        title: optional figure title.

    Returns:
        None. The figure is drawn on a new matplotlib figure.
    """
    # accept nested lists as well as arrays
    cf = np.asarray(cf)

    def _ratio(numerator, denominator):
        """Divide, reporting 0.0 instead of nan when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    total = np.sum(cf)

    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = [''] * cf.size

    if group_names and len(group_names) == cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(_ratio(value, total)) for value in cf.flatten()]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels, group_counts, group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0], cf.shape[1])

    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        # Accuracy is sum of diagonal divided by total observations
        accuracy = _ratio(np.trace(cf), float(total))

        # if it is a binary confusion matrix, show some more stats
        if len(cf) == 2:
            # Metrics for Binary Confusion Matrices; an empty predicted or
            # actual positive class yields 0.0 rather than nan.
            precision = _ratio(cf[1, 1], np.sum(cf[:, 1]))
            recall = _ratio(cf[1, 1], np.sum(cf[1, :]))
            f1_score = _ratio(2 * precision * recall, precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy, precision, recall, f1_score)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""

    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        # Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Do not show categories if xyticks is False
        categories = False

    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar, xticklabels=categories, yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)

    if title:
        plt.title(title)



# call the function
# group_names are only drawn when their count equals the number of matrix cells.
labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
categories = [f'label_name ({class_index})' for class_index in range(6)]
make_confusion_matrix(matrix, group_names=labels, categories=categories, cmap='binary')

## scikit learn classification report

In [None]:
from sklearn.metrics import classification_report

# Ground truth and predictions come from the lists built earlier in the notebook.
y_true, y_pred = true_label, predicted_labels
# One display name per class id 0..5.
target_names = [f'label_name ({class_index})' for class_index in range(6)]

report_text = classification_report(y_true, y_pred, target_names=target_names)
print(report_text)

## Mount google drive

In [None]:
# Attach Google Drive to the Colab runtime at the given mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

## rename columns

In [None]:
# Map positional column labels (0, 1, ...) to their new names, in order.
rename_cols = dict(enumerate(['class_id', 'comment']))

df = df.rename(columns=rename_cols)

## Convert text classes to numeric values

In [None]:
'''
convert text classes to numeric value.
e.g. 
positive -> 0
negative -> 1
neutral -> 2 
etc.... 
'''


def encode_class(data):
    """Add an integer ``class_id`` column derived from the ``class`` column.

    Ids are assigned in order of first appearance of each distinct class
    value (0, 1, 2, ...). The frame is modified in place and also returned;
    ``class_id`` is stored with the nullable ``Int64`` dtype.
    """
    for class_id, class_name in enumerate(data["class"].unique()):
        data.loc[data["class"] == class_name, "class_id"] = class_id

    data["class_id"] = data["class_id"].astype("Int64")
    return data

## Undersample and class balancing to a dataframe

In [None]:
'''
Undersample and balance all classes' data points to avoid biasness. 
make all class's data points equal 
'''


# Per-class sample sizes entered by the user, keyed by split name ("train" / "test").
_SAMPLE_SIZES = {}


def _requested_sample_size(split):
    """Ask once for the per-class sample size of ``split`` and remember the answer."""
    if split not in _SAMPLE_SIZES:
        _SAMPLE_SIZES[split] = int(
            input(
                f"Enter the amount of data that you want for each class for {split} set: "
            ))
    return _SAMPLE_SIZES[split]


def _sample_at_most(group, k):
    """Return ``group`` unchanged if it has fewer than ``k`` rows, else ``group.sample(k)``."""
    if len(group) < k:
        return group
    return group.sample(k)


def sampling_train_data(group, k=None):
    """Sample at most ``k`` rows of one class for the train set.

    Args:
        group: rows of a single class (supports ``len()`` and ``.sample(k)``).
        k: rows to keep per class. When omitted, the user is prompted on the
            first call and the answer is reused for later calls. The prompt
            is no longer issued while the function is being defined.

    Returns:
        The sampled rows, or ``group`` itself when it has fewer than ``k`` rows.
    """
    if k is None:
        k = _requested_sample_size("train")
    return _sample_at_most(group, k)


def sampling_test_data(group, k=None):
    """Sample at most ``k`` rows of one class for the test set.

    Args:
        group: rows of a single class (supports ``len()`` and ``.sample(k)``).
        k: rows to keep per class. When omitted, the user is prompted on the
            first call and the answer is reused for later calls.

    Returns:
        The sampled rows, or ``group`` itself when it has fewer than ``k`` rows.
    """
    if k is None:
        k = _requested_sample_size("test")
    return _sample_at_most(group, k)


def balance_data(data):
    """Undersample the train and test parts of ``data`` per class and save them.

    Rows are split on the ``"train-test"`` column (values ``"train"`` /
    ``"test"``), reduced to the ``feature``, ``class_id`` and ``class``
    columns, sampled per ``class_id`` with ``sampling_train_data`` /
    ``sampling_test_data``, and written to ``data/train_data.csv`` and
    ``data/test_data.csv``.

    Args:
        data: frame with the columns named above.

    Returns:
        None. The results are written to disk.
    """
    # imported locally: the function needs os.makedirs and the notebook does
    # not import os anywhere else
    import os

    train_data = data[data["train-test"] == "train"]
    test_data = data[data["train-test"] == "test"]
    train_df = train_data[["feature", "class_id", "class"]]
    test_df = test_data[["feature", "class_id", "class"]]
    train_df = train_df.groupby('class_id').apply(
        sampling_train_data).reset_index(drop=True)
    test_df = test_df.groupby('class_id').apply(
        sampling_test_data).reset_index(drop=True)
    # save train and test data to disk
    os.makedirs("data", exist_ok=True)
    train_df.to_csv("data/train_data.csv", index=False)
    test_df.to_csv("data/test_data.csv", index=False)


## simpletransformers installation dependencies

In [None]:
# Install the pinned torch / torchvision builds; -f points pip at the PyTorch wheel listing.
!pip install torch===1.2.0 torchvision===0.4.0 -f https://download.pytorch.org/whl/torch_stable.html
# Pinned transformers and simpletransformers releases.
!pip install transformers==2.11.0
!pip install simpletransformers==0.41.1
# Clone NVIDIA apex together with its submodules, then install it from the local checkout.
!git clone --recursive https://github.com/NVIDIA/apex.git
!cd apex && pip install .

## most frequent value row wise

In [None]:
# Most frequent value(s) within each row; ties produce additional output columns.
df.mode(axis='columns')

## Concatenate multiple dataframes (axis=0 stacks rows; use axis=1 for column wise)

In [None]:
# Stack the frames on top of each other along the index (axis 0).
df_list = [df1, df2, df3]
new_df = pd.concat(objs=df_list, axis='index')

## Remove empty/blank rows from dataframe

In [None]:
# Treat empty and whitespace-only strings as missing, then drop those rows.
# Matching only the literal two-space string missed '' and other blank values.
# The result is assigned back instead of using inplace=True on a column
# selection, which may operate on a temporary copy.
df['col_name'] = df['col_name'].replace(r'^\s*$', np.nan, regex=True)
df = df.dropna(subset=['col_name'])

## multi label target list conversion from normal columns

In [None]:
# Columns whose per-row values are packed into one list-valued 'Labels' column.
cols = ['col_name_1', 'col_name_2']
label_matrix = train_df[cols].to_numpy()
train_df['Labels'] = label_matrix.tolist()

'''

col_name_1   col_name_2    Labels
-------------------------------------
   0            0           [0, 0]
   1            1           [1, 1] 

'''

## download data from drive 

In [None]:
# Download a file from Google Drive by its file id using the gdown command-line tool.
! gdown --id 1njvYa1P3ZVCzCDusvOZV7Z_4P 

## int64 conversion of target values

In [None]:
# Replace the column with its int64 version. Plain column assignment swaps in
# the converted Series; writing through .loc[:, col] may set the values into
# the existing column and keep its old dtype.
df['col_name'] = df['col_name'].astype('int64')

## shuffle rows in pandas dataframe

In [None]:
# frac=1 draws every row exactly once in random order; the old index is
# discarded so the result is numbered 0..n-1 again.
(
    df
    .sample(frac=1)
    .reset_index(drop=True)
)

## text data augmentation nlpaug

In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf
from nlpaug.util import Action


# parameter list:
# https://nlpaug.readthedocs.io/en/latest/augmenter/augmenter.html


# Augmenter settings: candidate pool size per position and the edit action.
TOPK = 20
ACT = 'insert'

# Contextual word-embedding augmenter backed by distilbert-base-uncased.
aug_distilbert = naw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased',
    action=ACT,
    top_k=TOPK,
    # device='cuda',
)


# keep the original data and create 3 augmented data using contexual embedding
def contexual_embedding(text):
    """Return three augmented variants of ``text`` followed by the original.

    The variants come from ``aug_distilbert.augment(text, n=3)``; the
    untouched input is appended as the last element.
    """
    variants = aug_distilbert.augment(text, n=3)
    variants.append(text)
    return variants


# Replace each text with a list [augmented..., original]; the other columns
# (e.g. labels) stay attached to the row.
df["col_name"] = df["col_name"].apply(contexual_embedding)

# One output row per list element, with the remaining columns repeated.
dump_data = df.explode(column='col_name')

## filter column values based on given data

In [None]:
# Keep only the rows where both col_1 and col_2 hold 0 or 1.
binary_only = df[['col_1', 'col_2']].isin([0, 1]).all(axis='columns')
df = df[binary_only]

## remove rows from one dataframe that exist in other dataframe based on index

In [None]:
# Drop from df1 every row whose index label also appears in df2.
# The labels are passed to drop() directly: using them to index df1.index
# treated them as positions, which is only right for a default 0..n-1 index.
# errors='ignore' skips labels that df1 does not contain.
df1 = df1.drop(index=df2.index, errors='ignore')