## Mount Drive

In [None]:
# Mount Google Drive inside the Colab runtime. Every path under
# /content/drive/MyDrive used later in this notebook relies on this mount.
from google.colab import drive
drive.mount('/content/drive')

## Read Captions

In [None]:
def read_half_of_text_file(file_path):
    """Return the stripped, non-blank lines from the first half of a text file.

    The halfway point is taken on the raw line count (blank lines included)
    with floor division, so for an odd number of lines the middle line
    belongs to the second half and is not returned.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of strings, in file order, with surrounding whitespace removed.
    """
    with open(file_path, 'r') as handle:
        raw_lines = handle.readlines()

    # Cut first, filter afterwards: blank lines still count towards the split.
    first_half = raw_lines[:len(raw_lines) // 2]
    stripped = (raw.strip() for raw in first_half)
    return [text for text in stripped if text]

In [None]:
def read_all_of_text_file(file_path):
    """Return every non-blank line of a text file, stripped of whitespace.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of strings, in file order; whitespace-only lines are dropped.
    """
    with open(file_path, 'r') as handle:
        raw_lines = handle.readlines()

    return [text for text in map(str.strip, raw_lines) if text]


In [None]:
# Example usage: load only the first half of the captions file and report
# how many non-empty captions were read.
sentences = read_half_of_text_file('/content/drive/MyDrive/captions_output.txt')
print(len(sentences))

In [None]:
# Example usage: load the whole captions file. This rebinds the global
# `sentences`, replacing whatever an earlier cell stored in it.
sentences = read_all_of_text_file('/content/drive/MyDrive/captions_output.txt')
print(len(sentences))

## ConceptNet

In [None]:
import json
import os
import glob
import pickle
import numpy as np
from sys import stdout
from collections import Counter
import pandas as pd
from scipy import spatial
from scipy.io import loadmat, savemat
from scipy.spatial import distance

In [None]:
#ConceptNet NumberBatch

def text_to_uri(text):
    """
    An extremely cut-down version of ConceptNet's `standardized_concept_uri`.
    Converts a term such as "apple" into its ConceptNet URI, "/c/en/apple".

    The term is lower-cased, surrounding whitespace is dropped, and hyphens
    and runs of internal whitespace become single underscores, so both
    "hot dog" and "Hot-Dog" map to "/c/en/hot_dog". This lets the multi-word
    terms used in this file (e.g. "traffic light") be looked up.

    Only English is supported, and no punctuation other than hyphens is
    handled.
    """
    # split() with no argument also collapses repeated spaces/tabs/newlines.
    return '/c/en/' + '_'.join(text.lower().replace('-', '_').split())

def normalize_vec(vec):
    """
    Scale a vector to unit length, so that the dot product of two normalized
    vectors is their cosine similarity.

    A zero vector is handed back untouched (the same object), which makes
    every cosine similarity computed with it come out as zero.
    """
    length = vec.dot(vec) ** 0.5
    return vec / length if length != 0 else vec

class AttributeHeuristic:
    """Cosine-similarity heuristic over a ConceptNet Numberbatch embedding table.

    Attributes are documented here rather than inline:
      embeddings -- DataFrame read from the HDF5 file, indexed by concept URI.
      cache      -- dict mapping a concept URI to its unit-length vector.
    """

    def __init__(self, hdf5_filename):
        """
        Load a word embedding matrix that is the 'mat' member of an HDF5 file,
        with UTF-8 labels for its rows.

        (This is the format that ConceptNet Numberbatch word embeddings use.)
        """
        self.embeddings = pd.read_hdf(hdf5_filename, 'mat', encoding='utf-8')
        self.cache = {}

    def get_vector(self, term):
        """
        Look up the vector for a term, returning it normalized to a unit vector.
        If the term is out-of-vocabulary, return a zero vector.

        Because many terms appear repeatedly in the data, cache the result.
        """
        uri = text_to_uri(term)
        if uri in self.cache:
            return self.cache[uri]

        try:
            vec = normalize_vec(self.embeddings.loc[uri])
        except KeyError:
            # Build the zero vector with an explicit float fill value: a
            # Series created without data/dtype is object-typed on current
            # pandas, which would leak into every later dot product.
            vec = pd.Series(0.0, index=self.embeddings.columns)
        self.cache[uri] = vec
        return vec

    def get_similarity(self, term1, term2):
        """
        Get the cosine similarity between the embeddings of two terms.
        """
        return self.get_vector(term1).dot(self.get_vector(term2))

    def compare_attributes(self, term1, term2, attribute):
        """
        Our heuristic for whether an attribute applies more to term1 than
        to term2: find the cosine similarity of each term with the
        attribute, and take the difference of the square roots of those
        similarities.

        Negative similarities are clamped to zero before the square root.
        """
        match1 = max(0, self.get_similarity(term1, attribute)) ** 0.5
        match2 = max(0, self.get_similarity(term2, attribute)) ** 0.5
        return match1 - match2

    def classify(self, term1, term2, attribute, threshold):
        """
        Convert the attribute heuristic into a yes-or-no decision, by testing
        whether the difference is larger than a given threshold.
        """
        return self.compare_attributes(term1, term2, attribute) > threshold

    def evaluate(self, semeval_filename, threshold):
        """
        Evaluate the heuristic on a file containing instances of this form:

            banjo,harmonica,stations,0
            mushroom,onions,stem,1

        Return the macro-averaged F1 score. (As in the task, we use macro-
        averaged F1 instead of raw accuracy, to avoid being misled by
        imbalanced classes.)

        Blank lines are skipped. A non-blank line that does not have exactly
        four comma-separated fields raises ValueError.
        """
        # Imported here because nothing at file level brings f1_score into
        # scope; the file already depends on scikit-learn elsewhere.
        from sklearn.metrics import f1_score

        our_answers = []
        real_answers = []
        # The context manager closes the file even if a line fails to parse.
        with open(semeval_filename, encoding='utf-8') as infile:
            for line in infile:
                line = line.rstrip()
                if not line:
                    continue
                term1, term2, attribute, strval = line.split(',')
                discriminative = bool(int(strval))
                real_answers.append(discriminative)
                our_answers.append(self.classify(term1, term2, attribute, threshold))

        return f1_score(real_answers, our_answers, average='macro')

In [None]:
# Load the Numberbatch embeddings from Drive. `heuristic` is the shared,
# notebook-global instance that the module-level get_vector() delegates to.
heuristic = AttributeHeuristic('/content/drive/MyDrive/numberbatch-20180108-biased.h5')

## Get Visual Words Spacy

In [None]:
# 80 object-category names. do_spacy() keeps any token whose text appears in
# this list, regardless of its dependency role.
# NOTE(review): membership is tested against single token texts, so the
# multi-word entries ('traffic light', 'hot dog', ...) presumably never match
# with spaCy's default tokenizer -- confirm.
coco_labels = [
  'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
  'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat',
  'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
  'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
  'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
  'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
  'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
  'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
  'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
  'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]


In [None]:
# Load spaCy's small English pipeline once; do_spacy() reuses this global
# `en_nlp` object for every sentence.
import spacy
en_nlp = spacy.load('en_core_web_sm')

In [None]:
def do_spacy(sentence: str) -> list:
    """Return the "visual" words of a sentence as plain token texts.

    The sentence is lower-cased and parsed with the global `en_nlp`
    pipeline. A token is kept when either

    * its dependency label is one of amod/oprd/dobj/attr/nsubj/pobj and its
      part of speech is NOUN, VERB or ADJ, or
    * its text appears in the global `coco_labels` list.

    Tokens are returned in sentence order; repeated words are kept.
    """
    kept_deps = {'amod', 'oprd', 'dobj', 'attr', 'nsubj', 'pobj'}
    kept_pos = ('NOUN', 'VERB', 'ADJ')

    def is_visual(tok):
        if tok.text in coco_labels:
            return True
        return tok.dep_ in kept_deps and tok.pos_ in kept_pos

    return [tok.text for tok in en_nlp(sentence.lower()) if is_visual(tok)]


In [None]:
def do_spacy(sentence: str) -> list:
    """Return the "visual" words of a sentence as "(dep, word, head)" strings.

    Same selection rule as the plain-word `do_spacy` defined earlier in this
    file (dependency label in amod/oprd/dobj/attr/nsubj/pobj with POS
    NOUN/VERB/ADJ, or token text in the global `coco_labels`), but each hit
    is formatted as ``"(<dep>, <token>, <head token>)"``. For a direct
    object, its `amod` and `det` children are appended as well, formatted
    with the object as the third field.

    NOTE(review): this definition reuses the name `do_spacy`, so whichever
    cell ran last decides what callers get. Code that looks the returned
    items up as bare words will not match these formatted strings.
    """
    visual_deps = {'amod', 'oprd', 'dobj', 'attr', 'nsubj', 'pobj'}
    visual_words = []

    for token in en_nlp(sentence.lower()):
        grammatical = token.dep_ in visual_deps and token.pos_ in ('NOUN', 'VERB', 'ADJ')
        if not (grammatical or token.text in coco_labels):
            continue

        visual_words.append(f"({token.dep_}, {token.text}, {token.head.text})")

        # Only direct objects also report their modifiers and determiners.
        if token.dep_ != 'dobj':
            continue
        for child in token.children:
            if child.dep_ in ('amod', 'det'):
                visual_words.append(f"({child.dep_}, {child.text}, {token.text})")

    return visual_words



In [None]:
# Example usage: extract the visual words of one caption. The shape of the
# result (plain words or "(dep, word, head)" strings) depends on which
# do_spacy definition was executed last.
sentence = "A bearded, bald man wears a multicolored tie"
visual_words = do_spacy(sentence)
print(visual_words)

In [None]:
# Bare expression: lets the notebook display the list computed above.
visual_words

In [None]:
# Parse the "(dep, word, head)" strings produced by the triple-returning
# do_spacy back into their three fields and print them.
for item in visual_words:
    # Remove parentheses and split the string by comma
    parts = item.strip("()").split(", ")
    # Items that do not split into exactly three fields (e.g. plain words
    # from the other do_spacy variant) are silently skipped.
    if len(parts) == 3:
        rel, word1, word2 = parts  # Unpack the parts
        print(rel,word1,word2)

In [None]:
# Example usage on a short noun phrase; rebinds the globals `sentence` and
# `visual_words` used by the cells above.
sentence = "A black dog ."
visual_words = do_spacy(sentence)
print(visual_words)

## Save the Vectors

In [None]:
# Label id -> list of words/phrases that are assigned that label.
# The processing functions in this file lower-case both the candidate word
# and every entry, then test exact equality; when a word appears under
# several ids, the first id in insertion order wins.
# Consequences worth knowing when editing the lists:
#   * entries with a leading space (e.g. ' van', ' meow') only match text
#     that has the same leading space;
#   * adjacent literals without a comma are concatenated by Python, so
#     'truck' ' garden' is the single entry 'truck garden'.
# NOTE(review): the ids are not contiguous (12, 26, 29, 30, ... are absent);
# presumably they follow the COCO category ids -- confirm.
mydic={
1:['person','doll','girl','poet','character','enemy','individual','people','single','human','baby','stranger','farmer','captain','name','men','doctor','woman','owner','native','friend','judge','man','chief','member','merchant','agent','body','servant','guard','statue','boy','one people','me','neighbour','fellow','being','child','boys'],
2:['bicycle','cyclocross','step','hirondelle','biker','cyclocomputer','biciklo','hobby','horse','rower','wielrenner','pogo','monocycle','hitch rack','groupset','bicikal','daawheeyl','rota','exercise bicycle','velosiped','saikul','pneumatic','mount','donk','bicycle touring','velodrome','bidon','bmx'],
3:['car','hybrid','four','wheels','wheel','vehicle','wax','accord','transportation', 'seat','passenger', 'motor','ride','automobil','turn','drive','truck','auto','automobile','engine','van','oil','exhaust','brake','automobilski','gas','four wheels','kola','driver','road','driving','motor vehicle','automobilist'],
4:['motorcycle','motosikal','low side','smoker','touring motorcycle','dress for slide','scooter' ,'combination','pillion','monkey','chook chaser','moto','motocikl','motocycle','motorcycling','motomondiale','lay down','motorrad','gluaisrothar','ducati','sissy bar','bicycle'],
5:['airplane','rapid unplanned' ,'disassembly','aerobatics', 'jet', 'paper airplane','aeroplanum','ditch','biplane','repl','pako','wing','letoun','lidmana','messerschmitt','kiebitz','lentz','aeradio','vliegtoestel','heavier than air','avion','avionski','clipper','abyon','kapal terbang','aeroplan','monoplane','737','toestel'],
6:['bus','autobus','buss','knifeboard','bus station','transport','kolodvor','passenger','corriera','remains','headway','zais','component','seka auto','balai','busless','autobusov','kneeling bus','nysse','bondi','sabirnica','ambulance','sakayan','ekspressbuss','omnibus','double-decker','jitney','bus_topology','coach','motorcoach','charabanc','autobus','motorbus','passenger vehicle','busbar','heap','jalopy'],
7:['train','school', 'civilise','rail','gear','railroad_train','prepare','take aim','educate','wagon_train','string','take','caravan','trail','check','train','direct','cultivate','geartrain','power train','aim','civilize','coach','gearing','discipline','condition','groom','develop'],
8:['truck','sleeper','cab','lorry','chidtsoh','truckyard','wheel','vehicle','knuckle under','run out','kamion','camin','caro','roach coach','truck' ' garden', ' van', ' trucking', ' artic', ' platform' ' crane', ' camionero', ' cart', ' cisterna', ' halyard', ' truck driver','gun carriage','rustbucket','retarder','weigh station','camioncillo','truckless','cami','kamionski','defraud'],
9:['boat','sailing','vessel','sail','sails','floating vessel','yacht','water vehicle','vehicle','water','transport','transportation','small ship','passenger','barica','gondola','sailing vessel','ship','craft','water vessel','ocean','floating','water craft','rudder','captain','ark','watercraft','rib','small','sea','barka','canoe','ferry','amac'],
10:['traffic light'],
11:['fire hydrant'],
13:['stop sign'],
14:['parking meter'],
15:['bench','terrace','judiciary','Bench','workbench','work_bench'],
16:['bird','chicken','duck','wings','animal','winged','feathers','flying animal','nest','dove','egg','flying','quail','creature','feathered','chick','canary','bat','pigeon','owl'],
17:['cat',' meow','kitty','flea','feline','kitten','maca','cats','mexican hairless',' house',' pet'],
18:['dog','woof','tail','cat','flea',' wolf','bark','canine','friend','pet'],
19:['horse','ogat','sivac','hackney',' krkalo','race','equine','legs riding','jockey','saddle four legs','centaur','cow','mammal','hooves','racing','tail','hevonen','erav','pony','large','racing animal','struna','mane'],
20:['sheep','ram','shepherd','baa','lamb','wool animal','brabonjak','ewe','goat','wooly animal','woolly','baa baa','mutton','wooly','adult lamb','flock','cotton','lambs'],
21:['cow','ox','calf','beef','bovine animal','bovine','cattle','moo', 'moo animal'],
22:['elephant','fil','elevant','slonovski','elephants'],
23:['bear','grizzly','forest','large','brown'],
24:['zebra','zeedonk','zbre','zeal','stripe','zebras'],
25:['giraffe','camelopard','giraffe','Giraffa camelopardalis'],
27:['backpack','back_pack','knapsack','rucksack','backpack','packsack','pack','haversack'],
28:['umbrella','provide','brelly','sateenvarjo','ymbarl','purposes pandong','paraple'],
31:['handbag','bag','purse','pocketbook'],
32:['tie','connect','railroad tie','standoff','link_up', ' attach', ' sleeper', ' tie-in'],
33:['suitcase','travelling_bag','suitcase','grip','baggage','luggage'],
34:['frisbee','throw','pie','boomerang','turf','disk','record','glide'],
35:['skis','slope'],
36:['snowboard','droneboarding','lautailla','snowboarder','shredder','snowboards','ski'],
37:['sports ball'],
38:['kite','barcud','sail','frigi'],
39:['baseball bat'],
40:['baseball glove'],
41:['skateboard','freeboard','kingpin','hoverboard'],
42:['surfboard','skurf','longboard','surf','surf riding','boogieboard'],
43:['tennis racket'],
44:['bottle','bottleful','bottle','nursing bottle','feeding_bottle'],
46:['wine glass'],
47:['cup','drink','kup','soutien gorge','saucer','mug','pokal','glass','cups','kupa'],
48:['fork','ramification','ramify','branching','branch','separate','furcate','pitchfork','forking','crotch'],
49:['knife','edge','stick frog','tongue stab'],
50:['spoon','spoonful'],
51:['bowl','trough', 'arena','sports_stadium','stadium','bowling_ball','bowlful','pipe_bowl','roll'],
52:['banana','kentang', 'plantain', 'bann', 'banaaniplantaasi', 'mkungu', 'slip', 'saba'],
53:['apple','orchard apple tree', 'Malus pumila'],
54:['sandwich','ville', 'sendvi', 'kent'],
55:['orange','orangish','orangeness'],
56:['broccoli','parsakaali','raphanin','brokula','vegetable', 'brassicaceae','broccolilike' ,'calabrese'],
57:['carrot','cenoura','wortel'],
58:['hotdog'],
59:['pizza','pizzero','slice','pica','pizza margherita','pizzetta','puff','pizzaiola'],
60:['donut','sinker','doughnut','munk'],
61:['cake','round','sweet','baked','bakery',' birthday candles','desert', 'pie','birthday pastry'],
62:['chair','sitting','bedchair','chaire','sit', ' sitting device','seat'],
63:['couch','redact','sofa','frame','couch','lounge','cast','put'],
64:['potted plant'],
65:['bed','sleep in','room','bedroom','blankets'],
67:['dining table'],
70:['toilet','soil pipe','tiwalet','wc', ' washroom'],
72:['tv','tlvision','nightline','rebroadcaster','transvestite','news','tlviseur'],
73:['laptop','desktop','computer','fartlva','portatile','skootrekenaar','porttil','snpur','compound','desktop computer','notebook'],
74:['mouse','shiner', 'mouse','pussyfoot','computer mouse','sneak','creep','black_eye'],
75:['remote','remote_control','remote','distant','removed'],
76:['keyboard','type','tastatura','tastatur'],
77:['cell phone','mobile','phone','cellular telephone','cellular phone','cell','cellular','mobile phone','cellular_tele'],
78:['microwave','zap', 'nuke','microwave oven','micro cook'],
79:['oven','jara','kitchen','appliance','cooking'],
80:['toaster','wassailer','bread','kitchen appliance'],
81:['sink','go_under', 'bury','slide_down','fall_off','subside',' sump'],
82:['refrigerator',' icebox','fridge','frigorific','refrigerator','frigor'],
84:['book', 'reading thing','tome','cover','library',' page','reading','prirunik','literature','novel','story','book','booklet','bookman','bookworm','volume','record','script','book of account','ledger','record book','rule book','scripture','text','textbook','word','work','book of facts','fact book','playscript'],
85:['clock','telling','wall watch','time','clock','clocking','clock time'],
86:['vase','vza','vaseful','vaasi','vase','flower vase','vaseful'],
87:['scissors','scissors','scissorsgrip','scissor_grip','scissor_hold','pair of scissors','scissors hold'],
88:['teddybear'],
89:['hairdrier'],
90:['toothbrush','hambahari','tutbras','znnbiischt','toothbrushes','tandbrste','teeth','brush','tannkost','brosse dents','tannbrste','caday','cleaning','electric toothbrush'],
}

In [None]:
def embed_visual_words(sentence: str, mydic: dict) -> dict:
    """Map the embedding of each labelled visual word in a sentence to its label.

    The words returned by `do_spacy` are looked up in `mydic`
    (case-insensitively; the first label in dictionary order whose word list
    contains the word wins). Words without a label are ignored.

    NOTE(review): the original cell stopped after creating the empty result
    dict and implicitly returned None despite the `-> dict` annotation. The
    body below follows the labelling pattern used by the batch-processing
    functions in this file -- confirm this is the intended behaviour.

    Args:
        sentence: Caption text to analyse.
        mydic: Mapping of label id to a list of words for that label.

    Returns:
        A dict ``{tuple(vector): label}``. Words that share a vector share
        one entry, the later label winning.
    """
    visual_words = do_spacy(sentence)
    print('visual words:',visual_words)
    embeddings_with_labels = {}

    for word in visual_words:
        needle = word.lower()
        label = next(
            (key for key, value in mydic.items()
             if needle in (v.lower() for v in value)),
            None,
        )
        if label is not None:
            # Vectors are not hashable as-is; freeze them into tuples.
            embeddings_with_labels[tuple(heuristic.get_vector(word))] = label

    return embeddings_with_labels

In [None]:
# Run embed_visual_words over every caption currently held in the global
# `sentences` list and print what it returns for each one.
for sentence in sentences:
    embeddings = embed_visual_words(sentence, mydic)
    print(f"Sentence: {sentence}\nEmbeddings: {embeddings}\n")

In [None]:
def get_vector(word):
    """Return the embedding vector for `word` from the global `heuristic`.

    Thin convenience wrapper around `heuristic.get_vector`; requires the
    cell that creates `heuristic` to have been run first.
    """
    # Despite the earlier "placeholder" wording, this already delegates to
    # the real embedding lookup.
    return heuristic.get_vector(word)

In [None]:
import pickle
def process_sentences_in_batches(file_path, mydic, batch_size=10000, allowed_labels=(1, 2, 3, 4, 5)):
    """Label the visual words of every caption and pickle the results per batch.

    Every non-empty line of `file_path` is one sentence. For each sentence
    the words returned by `do_spacy` are looked up in `mydic`
    (case-insensitively; the first label in dictionary order whose word list
    contains the word wins). Words whose label is in `allowed_labels` are
    stored as ``{tuple(vector): label}``; sentences with no such word are
    left out of the batch.

    Args:
        file_path: Text file with one caption per line.
        mydic: Mapping of label id to a list of words for that label.
        batch_size: Number of sentences per output file.
        allowed_labels: Label ids to keep (any container supporting `in`).

    Returns:
        None. Writes ``embeddings_batch_<k>.pkl`` (k starting at 1) to the
        current working directory, each holding a list of per-sentence dicts.
    """
    # Read ALL sentences of the text file (not just the first half).
    sentences = read_all_of_text_file(file_path)

    # Build the word -> label table once. setdefault keeps the first label
    # seen, which is the same answer as scanning mydic in order per word.
    label_lookup = {}
    for key, value in mydic.items():
        for v in value:
            label_lookup.setdefault(v.lower(), key)

    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i + batch_size]
        results = []

        for sentence in batch_sentences:
            print(sentence)
            visual_words = do_spacy(sentence)
            embeddings_with_labels = {}
            print("Extracted visual words:", visual_words)  # Debug print

            for word in visual_words:
                vector = get_vector(word)
                label = label_lookup.get(word.lower())
                print(f"Word: '{word}', Label found: {label}")  # Debug print
                if label is not None and label in allowed_labels:
                    print(f'----------The {word} in allowdLabels---------')
                    # Keyed by the vector itself: words with identical
                    # vectors share one entry, the later label winning.
                    embeddings_with_labels[tuple(vector)] = label

            if embeddings_with_labels:
                results.append(embeddings_with_labels)

        # Save the filtered embeddings in a pickle file
        pkl_filename = f'embeddings_batch_{i//batch_size + 1}.pkl'
        with open(pkl_filename, 'wb') as pkl_file:
            pickle.dump(results, pkl_file)
        print(f"Saved {pkl_filename} containing batch {i//batch_size + 1}")


In [None]:
# Process the full captions file with the default batch size and label
# filter; the batch pickles are written to the current working directory.
file_path = '/content/drive/MyDrive/captions_output.txt'
process_sentences_in_batches(file_path, mydic)

In [None]:
# save it as dic

import pickle

def process_sentences(file_path, mydic, allowed_labels=(1, 2, 3, 4, 5)):
    """Label the visual words of every caption, writing one pickle per sentence.

    Every non-empty line of `file_path` is one sentence. The words returned
    by `do_spacy` are looked up in `mydic` (case-insensitively; the first
    label in dictionary order whose word list contains the word wins), and
    words whose label is in `allowed_labels` are stored as
    ``{tuple(vector): label}``.

    Args:
        file_path: Text file with one caption per line.
        mydic: Mapping of label id to a list of words for that label.
        allowed_labels: Label ids to keep (any container supporting `in`).

    Returns:
        None. For each sentence that yields at least one kept word, writes
        ``embeddings_<n>.pkl`` to the current working directory; n counts
        only the files written, so it is not the sentence's line number.
    """
    # Read all sentences from the text file
    sentences = read_all_of_text_file(file_path)
    file_counter = 1  # Counter to name files uniquely

    # Build the word -> label table once. setdefault keeps the first label
    # seen, which is the same answer as scanning mydic in order per word.
    label_lookup = {}
    for key, value in mydic.items():
        for v in value:
            label_lookup.setdefault(v.lower(), key)

    for sentence in sentences:
        print(sentence)
        visual_words = do_spacy(sentence)
        embeddings_with_labels = {}
        print("Extracted visual words:", visual_words)  # Debug print

        for word in visual_words:
            vector = get_vector(word)
            label = label_lookup.get(word.lower())
            print(f"Word: '{word}', Label found: {label}")  # Debug print
            if label is not None and label in allowed_labels:
                print(f'----------The {word} in allowdLabels---------')
                # Words with identical vectors share one entry.
                embeddings_with_labels[tuple(vector)] = label

        if embeddings_with_labels:
            # Save the embeddings in a pickle file
            pkl_filename = f'embeddings_{file_counter}.pkl'
            with open(pkl_filename, 'wb') as pkl_file:
                pickle.dump(embeddings_with_labels, pkl_file)
            print(f"Saved {pkl_filename}")
            file_counter += 1


In [None]:
import pickle

# Sanity check: read the first batch file back and print its contents.
# Open the .pkl file in binary read mode
with open('/content/embeddings_batch_1.pkl', 'rb') as file:
    # Load the data from the pickle file (only unpickle files you produced
    # yourself; pickle can execute code on load)
    loaded_data = pickle.load(file)

# Print or process the loaded data
print(loaded_data)


In [None]:
# Shell escape: copy one batch file to Drive. Only batch 10 is copied here,
# while the combine step reads batches 1-10 from that folder.
# NOTE(review): presumably this line was edited and re-run per batch -- confirm.
!cp '/content/embeddings_batch_10.pkl' '/content/drive/MyDrive/src_for_RunAndTest'

## Combine pkl file

In [None]:
import pickle

# Merge the per-batch pickle files (each a list) into one list and save it.
# Initialize an empty list to store combined data
combined_data = []

# Loop through each file and append its contents to combined_data
for i in range(1, 11):  # Assuming file indices from 1 to 10
    filename = f'/content/drive/MyDrive/src_for_RunAndTest/embeddings_batch_{i}.pkl'

    # A missing batch file raises FileNotFoundError and aborts the merge.
    with open(filename, 'rb') as file:
        data = pickle.load(file)
        combined_data.extend(data)  # Use extend for lists

# Save the combined data back to a new .pkl file
# (relative path: written to the current working directory, not to Drive)
with open('combined_embeddings.pkl', 'wb') as file:
    pickle.dump(combined_data, file)


## For evaluation

In [None]:
def process_sentences(sentence=' ', allowed_labels=(1, 2, 3, 4, 5)):
    """Return the labelled visual words of one sentence with their vectors.

    The words returned by `do_spacy` are looked up in the global `mydic`
    (case-insensitively; the first label in dictionary order whose word list
    contains the word wins).

    Args:
        sentence: Caption text to analyse.
        allowed_labels: Label ids to keep (any container supporting `in`).

    Returns:
        A dict ``{word: {'vector': vector, 'label': label}}`` holding only
        words whose label is in `allowed_labels`. A word occurring several
        times appears once.
    """
    # Process the sentence with spacy or a similar tool to get visual words
    visual_words = do_spacy(sentence)
    embeddings_with_labels = {}
    print("Extracted visual words:", visual_words)  # Debug print

    # Build the word -> label table once. setdefault keeps the first label
    # seen, which is the same answer as scanning mydic in order per word.
    label_lookup = {}
    for key, value in mydic.items():
        for v in value:
            label_lookup.setdefault(v.lower(), key)

    for word in visual_words:
        vector = get_vector(word)
        label = label_lookup.get(word.lower())
        print(f"Word: '{word}', Label found: {label}")  # Debug print
        if label is not None and label in allowed_labels:
            print(f'----------The {word} in allowedLabels---------')
            # Store the word along with its vector and label
            embeddings_with_labels[word] = {'vector': vector, 'label': label}

    return embeddings_with_labels


In [None]:
import pandas as pd

def process_sentences(sentence=' ', allowed_labels=None):
    """Write the labelled visual-word vectors of one sentence to output.csv.

    Each visual word returned by `do_spacy` is matched, case-insensitively,
    against the word lists of the global `mydic`; the first label in
    dictionary order wins. Every kept word becomes one CSV row made of its
    vector components followed by the label.

    Args:
        sentence: Caption text to analyse.
        allowed_labels: Label ids to keep, or None to keep every label.

    Returns:
        The DataFrame that was written to 'output.csv' (no header, no
        index). The file is overwritten on every call.
    """
    print("Processing sentence:", sentence)
    visual_words = do_spacy(sentence)
    print("Extracted visual words:", visual_words)

    data_for_csv = []

    for word in visual_words:
        vector = get_vector(word)

        # First label whose (lower-cased) word list contains the word.
        label = None
        for key, value in mydic.items():
            if word.lower() in [v.lower() for v in value]:
                label = key
                break

        if label is None:
            print(f"Label not found for '{word}'")
            continue

        print(f"Word: '{word}', Label found: {label}")
        if allowed_labels is not None and label not in allowed_labels:
            continue
        print(f'Adding {word} with label {label} to CSV data')
        data_for_csv.append([*vector, label])

    # Headerless frame: vector columns first, label last.
    csv_file_name = 'output.csv'
    df = pd.DataFrame(data_for_csv)
    df.to_csv(csv_file_name, index=False, header=False)
    print(f"Data saved to {csv_file_name}")

    return df


### Last function

In [None]:
def process_file(file_path, allowed_labels=None):
    """Write the labelled visual-word vectors of every caption to output.csv.

    Each non-blank line of `file_path` is one sentence. Its visual words
    (from `do_spacy`) are matched, case-insensitively, against the word
    lists of the global `mydic`; the first label in dictionary order wins.
    Every kept word becomes one CSV row made of its vector components
    followed by the label.

    Args:
        file_path: Text file with one caption per line.
        allowed_labels: Label ids to keep, or None to keep every label.

    Returns:
        The DataFrame that was written to 'output.csv' (no header, no
        index). The file is overwritten on every call.
    """
    data_for_csv = []

    with open(file_path, 'r') as file:
        for raw_line in file:
            sentence = raw_line.strip()
            if not sentence:
                continue  # blank line

            print("Processing sentence:", sentence)
            visual_words = do_spacy(sentence)
            print("Extracted visual words:", visual_words)

            for word in visual_words:
                vector = get_vector(word)

                # First label whose (lower-cased) word list contains the word.
                label = None
                for key, value in mydic.items():
                    if word.lower() in [v.lower() for v in value]:
                        label = key
                        break

                if label is None:
                    print(f"Label not found for '{word}'")
                    continue

                print(f"Word: '{word}', Label found: {label}")
                if allowed_labels is not None and label not in allowed_labels:
                    continue
                print(f'Adding {word} with label {label} to CSV data')
                data_for_csv.append([*vector, label])

    # Headerless frame: vector columns first, label last.
    csv_file_name = 'output.csv'
    df = pd.DataFrame(data_for_csv)
    df.to_csv(csv_file_name, index=False, header=False)
    print(f"Data saved to {csv_file_name}")

    return df

# Example usage: label every caption in capt.txt (no label filter) and
# write the result to output.csv in the current working directory.

process_file('/content/capt.txt')

## Get caption and image

In [None]:
import pandas as pd

def process_file(file_path, allowed_labels=(1, 2, 3, 4, 5)):
    """Label the visual words of image captions and write them to output.csv.

    The input file interleaves image names and captions: a non-blank line
    ending in '.jpg' sets the current image name, and every other non-blank
    line is a caption belonging to that image. Caption words returned by
    `do_spacy` are looked up in the global `mydic` (case-insensitively; the
    first label in dictionary order whose word list contains the word wins).

    Args:
        file_path: Text file with image-name lines and caption lines.
        allowed_labels: Label ids to keep, or None to keep every label.

    Returns:
        A tuple ``(df, all_captions)``. `df` has one row per kept word with
        columns Feature_1..Feature_N, Label, ImageName, Caption and is also
        written to 'output.csv' with a header row. `all_captions` lists
        every caption line read, kept or not. When no word is kept, `df` is
        empty and has only the Label, ImageName and Caption columns.
    """
    data_for_csv = []
    current_image_name = ''  # Image name seen most recently ('' before the first)
    all_captions = []  # Every caption line, in file order

    # Build the word -> label table once. setdefault keeps the first label
    # seen, which is the same answer as scanning mydic in order per word.
    label_lookup = {}
    for key, value in mydic.items():
        for v in value:
            label_lookup.setdefault(v.lower(), key)

    with open(file_path, 'r') as file:
        for sentence in file:
            sentence = sentence.strip()  # Remove leading/trailing whitespaces
            if not sentence:
                continue

            # An image-name line only updates the context for later captions.
            if sentence.endswith('.jpg'):
                current_image_name = sentence
                continue

            print("Processing sentence:", sentence)
            visual_words = do_spacy(sentence)
            print("Extracted visual words:", visual_words)

            for word in visual_words:
                vector = get_vector(word)
                label = label_lookup.get(word.lower())
                if label is not None:
                    print(f"Word: '{word}', Label found: {label}")
                    if allowed_labels is None or label in allowed_labels:
                        print(f'Adding {word} with label {label} to CSV data')
                        # Add the current image name and the original caption to each row
                        data_for_csv.append(list(vector) + [label, current_image_name, sentence])
                else:
                    print(f"Label not found for '{word}'")
            all_captions.append(sentence)

    # Derive the vector width from the first row; with no rows at all there
    # are no feature columns (indexing data_for_csv[0] would raise IndexError).
    if data_for_csv:
        num_vector_elements = len(data_for_csv[0]) - 3  # minus label, image name, caption
    else:
        num_vector_elements = 0
    vector_column_names = [f'Feature_{i+1}' for i in range(num_vector_elements)]
    df = pd.DataFrame(data_for_csv, columns=vector_column_names + ['Label', 'ImageName', 'Caption'])

    csv_file_name = 'output.csv'
    df.to_csv(csv_file_name, index=False)
    print(f"Data saved to {csv_file_name}")

    return df, all_captions

# Note: You will need to implement or adjust the `do_spacy`, `get_vector`, and `mydic` parts according to your specific requirements.


In [None]:
# Run the image/caption variant of process_file with its default label
# filter; this (re)writes output.csv with a header row.
process_file('/content/image_captions_output.txt')

In [None]:
# Test the function on a single sentence.
# NOTE(review): several cells define `process_sentences`; if the CSV-writing
# definition was the last one executed, this call overwrites output.csv.
test_sentence = "I have a happy dog and a blue car"
result = process_sentences(test_sentence)
print("Result:", result)

### PCA

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# PCA for the headerless CSV layout: <vector components..., label>.
# NOTE(review): if output.csv was last written WITH a header row and
# ImageName/Caption columns, header=None turns that header into a data row
# and this cell will not work as intended -- confirm which writer ran last.
# Load the data from CSV
csv_file_name = 'output.csv'
data = pd.read_csv(csv_file_name, header=None)

# Separate features and labels
features = data.iloc[:, :-1]  # All columns except the last one
labels = data.iloc[:, -1]    # Only the last column

# Standardize the features
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

# Apply PCA to reduce dimensions to 10
pca = PCA(n_components=10)
reduced_features = pca.fit_transform(features_std)

# Recombine reduced features with labels
reduced_data = pd.DataFrame(reduced_features)
reduced_data['label'] = labels

# Save the reduced data to a new CSV file (no header, no index)
reduced_csv_file_name = 'reduced_output_New.csv'
reduced_data.to_csv(reduced_csv_file_name, index=False, header=False)
print(f"Reduced data saved to {reduced_csv_file_name}")


## PCA to 10

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# PCA for the CSV layout with a header row:
#   Feature_1..Feature_N, Label, ImageName, Caption
# Load the data from CSV
csv_file_name = 'output.csv'
data = pd.read_csv(csv_file_name)

# Select the feature columns by name. The file is written without an index
# column, so a positional slice such as 1:301 would drop Feature_1 and pull
# the Label column into the PCA input.
feature_columns = [col for col in data.columns if str(col).startswith('Feature_')]
if not feature_columns:
    raise ValueError(f"No 'Feature_*' columns found in {csv_file_name}")
features = data[feature_columns]
labels = data.iloc[:, -2:]      # Last two columns: ImageName, Caption

# Standardize the features
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

# Apply PCA to reduce dimensions to 10
pca = PCA(n_components=10)
reduced_features = pca.fit_transform(features_std)

# Recombine reduced features with labels
reduced_data = pd.DataFrame(reduced_features)
# Concatenate reduced features with ImageName/Caption horizontally
reduced_data = pd.concat([reduced_data, labels.reset_index(drop=True)], axis=1)

# Save the reduced data to a new CSV file
reduced_csv_file_name = 'reduced_output_New.csv'
reduced_data.to_csv(reduced_csv_file_name, index=False)
print(f"Reduced data saved to {reduced_csv_file_name}")