# Data Preprocessing

### Importing libraries

In [9]:
from datasets import load_dataset
import datasets
import pandas as pd
import numpy as np
import re
import os

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from langdetect import detect

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import sys
import importlib

from transformers import BlipProcessor, BlipForConditionalGeneration
import torch



In [3]:
# Locate <parent of the working directory>/src/data, the folder that holds
# the project's data_cleaner.py helper module.
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
src_data_directory = os.path.join(parent_directory, "src", "data")

# Register the folder on the import path only once, so that re-running this
# cell does not pile up duplicate sys.path entries.
if src_data_directory not in sys.path:
    sys.path.append(src_data_directory)




In [7]:
# Import the project's cleaning helpers; the module is found through the
# src/data directory that was appended to sys.path earlier in this notebook.
import data_cleaner
# Reload the module so that edits made to data_cleaner.py take effect in the
# running session. Kept as the last expression so the notebook displays the
# reloaded module.
importlib.reload(data_cleaner)

<module 'data_cleaner' from '/Users/psinha/Desktop/Structred Folder/src/data/data_cleaner.py'>

### Load the dataset

In [None]:
# Fetch the MathVista dataset through the Hugging Face `datasets` library.
# NOTE: despite the variable name, the split requested here is "test".
train_data = load_dataset(path="AI4Math/MathVista", split="test")




### Initialize Mac GPU

In [91]:

# Select the device used by the rest of the notebook. Later cells reference
# `mps_device` unconditionally (model.to(mps_device), tensor.to(mps_device)),
# so the name must be defined even when no Apple-silicon GPU is present.
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    # Allocate a tiny tensor on the device as a smoke test.
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")
    # Fall back to the CPU under the same name so downstream cells still run
    # instead of failing with a NameError.
    mps_device = torch.device("cpu")
    print ("Falling back to CPU.")


tensor([1.], device='mps:0')


### Load the BLIP model for transfer learning

In [93]:
# The processor and the captioning model are both loaded from the same
# pretrained BLIP checkpoint.
_blip_checkpoint = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(_blip_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(_blip_checkpoint)

# Move the model's weights onto the device selected earlier in the notebook.
model.to(mps_device)

### Image Processor and caption generator

In [None]:


def BLIP_Caption_Preprocessor_And_Generator(image_data):
    """
    Generate a BLIP caption for a single image.

    Relies on the module-level ``processor`` and ``model`` objects loaded
    earlier in the notebook.

    Args:
        image_data: Image object exposing ``convert('RGB')`` (presumably a
            PIL image taken from the dataset's ``decoded_image`` column --
            confirm against the caller).

    Returns:
        str: The caption decoded from the first generated sequence, with
        special tokens stripped.
    """
    raw_image = image_data.convert('RGB')                                   # Convert to RGB format

    # Hand the image straight to the processor. Building a tensor and pushing
    # it to the accelerator first is unnecessary: the processor does its own
    # resizing/normalisation and returns fresh tensors, so only its outputs
    # need to be moved to the device.
    inputs = processor(images=raw_image, return_tensors="pt")

    # Inputs must live on the same device as the model; ask the model instead
    # of depending on a separately maintained global device variable.
    device = model.device
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

    out = model.generate(**inputs)                                          # Generate caption token ids
    return processor.decode(out[0], skip_special_tokens=True)               # Decode the first sequence to text


### Initialize a data frame

In [None]:
# Build a DataFrame from the loaded dataset (one row per example).
df = pd.DataFrame(data=train_data)



### Removing non-English rows

In [None]:
# Use our own helper from data_cleaner to remove non-English rows.

# Detect the language of each question. The column is named 'question'
# (singular), matching example['question'] used when captions are generated;
# 'questions' would raise a KeyError.
df['language'] = df['question'].apply(data_cleaner.detect_language)

# Keep only the English rows and drop the helper column in the same step.
# The index is intentionally NOT reset: chunk_trainer writes its results with
# df.at[i, ...] where i is the example's position in train_data, so the
# original labels must survive for each text to land on the matching row.
df = df[df['language'] == 'en'].drop(columns=['language'])





### Add the processed text as a column in the data frame

In [None]:
# Add the 'preprocessed_text' column, initialised to an empty string for every
# row; chunk_trainer later fills it in row by row.
df['preprocessed_text'] = ""

### Save the captions from our dataset into the data frame in chunks

In [None]:
def chunk_trainer(df, chunk_size, chunk_number, train_data):
    """
    Caption and preprocess one chunk of ``train_data``, storing results in ``df``.

    For every example in the chunk, the image caption produced by
    ``BLIP_Caption_Preprocessor_And_Generator`` is combined with the example's
    question, cleaned with ``data_cleaner.preprocess_text`` and written to the
    ``preprocessed_text`` column of the row whose index label equals the
    example's position in ``train_data``.

    Args:
        df: DataFrame to update in place; must have a ``preprocessed_text`` column.
        chunk_size: Number of examples per chunk.
        chunk_number: Zero-based index of the chunk to process.
        train_data: Indexable collection of examples, each providing the
            ``'question'`` and ``'decoded_image'`` keys.

    Returns:
        The same ``df`` object, also when the chunk is empty.
    """
    start_index = chunk_number * chunk_size                                 # First position of the chunk.
    end_index = min(start_index + chunk_size, len(train_data))              # Clamp the last chunk to the data length.

    for i in range(start_index, end_index):
        # Skip positions that have no row in df (e.g. rows filtered out
        # earlier): df.at on a missing label would silently append a new,
        # mostly-empty row, and captioning such an example is wasted work.
        if i not in df.index:
            continue

        example = train_data[i]
        caption = BLIP_Caption_Preprocessor_And_Generator(example['decoded_image'])

        # Join caption and question with a space so that the last caption word
        # and the first question word are not fused into a single token.
        combined_text = f"{caption} {example['question']}"

        # Store the cleaned text on the corresponding row.
        df.at[i, 'preprocessed_text'] = data_cleaner.preprocess_text(combined_text)

        print ("Processed Image : ",i)

    # Report and return only after the WHOLE chunk has been processed; doing
    # this inside the loop would stop after the first example.
    print(f"Processed chunk {chunk_number}")

    return df


In [None]:
# Process the whole dataset chunk by chunk. The number of chunks is derived
# from the dataset length (ceiling division) instead of being hard-coded, so
# no trailing examples are left unprocessed and no empty chunk is requested.
chunk_size = 500
num_chunks = (len(train_data) + chunk_size - 1) // chunk_size

for chunk_number in range(num_chunks):
    df = chunk_trainer(df, chunk_size, chunk_number, train_data)

### Saving in CSV format

In [None]:
# Write the data frame (index included) to a CSV file in the working directory.
df.to_csv(path_or_buf='df_name.csv')