Text Classification

In [2]:
import os
# Show the full path of every entry sitting directly inside the training folder.
folder_path = "/content/drive/MyDrive/AI Training"
entry_names = os.listdir(folder_path)
for file_name in entry_names:
    entry_path = os.path.join(folder_path, file_name)
    print(entry_path)

/content/drive/MyDrive/AI Training/Daily Climate Data.csv
/content/drive/MyDrive/AI Training/Text_Files.csv
/content/drive/MyDrive/AI Training/Daily Climate Data.xlsx
/content/drive/MyDrive/AI Training/Audio Folder
/content/drive/MyDrive/AI Training/Image Folder
/content/drive/MyDrive/AI Training/Text Files
/content/drive/MyDrive/AI Training/combined_texts.csv


In [4]:
# Preview every text file found anywhere below the training folder.
root_path = "/content/drive/MyDrive/AI Training"
for folder_path, subfolders, files in os.walk(root_path):
    for file_name in files:
        # Compare against the lower-cased name so ".TXT" / ".Txt" files are
        # listed too instead of being silently skipped.
        if file_name.lower().endswith(".txt"):
            full_path = os.path.join(folder_path, file_name)
            print(full_path)

/content/drive/MyDrive/AI Training/Text Files/Intro.txt
/content/drive/MyDrive/AI Training/Text Files/intro-1.txt
/content/drive/MyDrive/AI Training/Text Files/intro-1 - Copy.txt
/content/drive/MyDrive/AI Training/Text Files/intro-1 - Copy (2).txt


In [6]:
import os
import pandas as pd

root_path = "/content/drive/MyDrive/AI Training"
txt_file_paths = []

# Collect all .txt file paths below root_path. The extension is matched
# case-insensitively so files such as "NOTES.TXT" are not silently skipped.
for folder_path, subfolders, files in os.walk(root_path):
    for file_name in files:
        if file_name.lower().endswith(".txt"):
            full_path = os.path.join(folder_path, file_name)
            txt_file_paths.append(full_path)

print(txt_file_paths)

# Read every file into one record; each record becomes one CSV row.
data = []
for file_path in txt_file_paths:
    with open(file_path, 'r', encoding='utf-8') as f:
        data.append({'Content_from_txt': f.read()})

# Create a DataFrame and save to CSV.
# The column is named explicitly so the header is still written when no text
# file was found; otherwise the CSV would be empty and unreadable downstream.
df = pd.DataFrame(data, columns=['Content_from_txt'])
csv_output_path = os.path.join(root_path, 'combined_texts.csv')
df.to_csv(csv_output_path, index=False)

print(f"CSV saved to: {csv_output_path}")

['/content/drive/MyDrive/AI Training/Text Files/Intro.txt', '/content/drive/MyDrive/AI Training/Text Files/intro-1.txt', '/content/drive/MyDrive/AI Training/Text Files/intro-1 - Copy.txt', '/content/drive/MyDrive/AI Training/Text Files/intro-1 - Copy (2).txt']
CSV saved to: /content/drive/MyDrive/AI Training/combined_texts.csv


In [7]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK data used below: the punkt tokenizer resources and the
# stop-word corpus.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Load only the raw text column. This cell later overwrites the same CSV with
# extra columns ("cleaned" plus one column per vocabulary term); restricting
# the read keeps a re-run from loading those columns and duplicating them.
# keep_default_na=False keeps empty or "NA"-like texts as plain strings
# instead of NaN, which astype(str) would otherwise turn into the word "nan".
df = pd.read_csv(
    "/content/drive/MyDrive/AI Training/combined_texts.csv",
    usecols=["Content_from_txt"],
    keep_default_na=False,
)

stop_words = set(stopwords.words('english'))

# Preprocessing: Tokenization + Stopword Removal
def preprocess(text):
    """Return the alphabetic, non-stop-word tokens of ``text`` as one string.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic or that appear in ``stop_words`` are dropped.
    The remaining tokens are joined with single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Clean every document; astype(str) coerces any non-string cell to text first.
cleaned_texts = df['Content_from_txt'].astype(str).apply(preprocess)
df['cleaned'] = cleaned_texts

# Bag of Words using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned'])

# One column per vocabulary term, one row per document.
feature_names = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(X.toarray(), columns=feature_names)

# Put the count columns next to the text columns and overwrite the CSV in place.
bow_csv_path = "/content/drive/MyDrive/AI Training/combined_texts.csv"
final_df = pd.concat([df, bow_df], axis=1)
final_df.to_csv(bow_csv_path, index=False)

print("Saved updated CSV with BoW features.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Saved updated CSV with BoW features.


In [8]:
# Reload the combined CSV from Drive and preview its first rows.
bow_csv_path = "/content/drive/MyDrive/AI Training/combined_texts.csv"
df = pd.read_csv(bow_csv_path)
df.head()

Unnamed: 0,Content_from_txt,cleaned,courses,deep,dinesh,hi,hope,introduction,learning,machine,rbg,well
0,"Hi, Hope you are doing well.\n",hi hope well,0,0,0,1,1,0,0,0,0,1
1,Hi this is Dinesh From RBG,hi dinesh rbg,0,0,1,1,0,0,0,0,1,0
2,This is Courses of Deep Learning,courses deep learning,1,1,0,0,0,0,1,0,0,0
3,This an Introduction of Machine Learning,introduction machine learning,0,0,0,0,0,1,1,1,0,0


IMAGE TO PIXEL

In [9]:
!pip install numpy pandas pillow tqdm



In [10]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

In [11]:
def images_to_csv(folder_path, output_csv):
    """Flatten labelled images into a CSV of grayscale pixel values.

    ``folder_path`` must contain one sub-directory per class, named with an
    integer label (``0``, ``1``, ...). Every ``.png`` / ``.jpg`` / ``.jpeg``
    file inside is converted to mode ``"L"`` (grayscale), flattened, and
    written as one row: ``label, pixel0, pixel1, ...``.

    Files that cannot be read, that sit in a folder whose name is not an
    integer, or whose pixel count differs from the first accepted image are
    skipped with a printed message.

    Args:
        folder_path: Directory holding the per-label sub-directories.
        output_csv: Destination path of the CSV file.

    Raises:
        ValueError: If no image could be read, so there is nothing to write.
    """
    data = []
    # Pixel count of the first accepted image; every later row must match it,
    # otherwise the rows would be ragged and the table malformed.
    expected_pixels = None

    # Sort subfolders to keep label order consistent
    for label in tqdm(sorted(os.listdir(folder_path))):
        label_path = os.path.join(folder_path, label)
        if not os.path.isdir(label_path):
            continue

        # Process only image files with valid extensions; sorted so the row
        # order inside each label is reproducible as well.
        for image_name in sorted(os.listdir(label_path)):
            if not image_name.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue

            image_path = os.path.join(label_path, image_name)
            try:
                label_id = int(label)
                # The `with` owns the opened file; convert() returns a new
                # image, so it must not be the object the `with` manages.
                with Image.open(image_path) as img:
                    pixels = np.array(img.convert("L")).reshape(-1)

                if expected_pixels is None:
                    expected_pixels = pixels.size
                elif pixels.size != expected_pixels:
                    raise ValueError(
                        f"expected {expected_pixels} pixels, got {pixels.size}"
                    )

                data.append([label_id] + pixels.tolist())
            except Exception as e:
                # Best effort: report the problem and carry on with the rest.
                print(f"Skipping {image_path}, error: {e}")

    if not data:
        raise ValueError(f"No readable images found in: {folder_path}")

    # Prepare column names: label + pixels
    pixel_columns = [f'pixel{i}' for i in range(expected_pixels)]
    df = pd.DataFrame(data, columns=['label'] + pixel_columns)
    df.to_csv(output_csv, index=False)
    print(f"Saved CSV file to: {output_csv}")



In [14]:
# NOTE(review): the input is the MNIST "test" folder but the output file is
# named "mnist_train.csv" — confirm this name is intended.
images_to_csv(
    folder_path="/content/drive/MyDrive/MNIST/test",
    output_csv="mnist_train.csv",
)


100%|██████████| 2/2 [01:03<00:00, 31.63s/it]


Saved CSV file to: mnist_train.csv


SPEECH TO TEXT

In [None]:
!pip install openai-whisper SpeechRecognition pydub

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 800.5/800.5 kB 16.1 MB/s eta 0:00:00
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting SpeechRecognition
  Downloading speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_run

In [None]:
import os
import pandas as pd
import whisper
from tqdm import tqdm

In [None]:
def speech_dataset_to_csv(folder_path, output_csv):
    """Transcribe labelled ``.wav`` files and write the results to a CSV.

    Each sub-directory of ``folder_path`` is treated as one label; its name is
    stored as ``emotion_label``. Every ``.wav`` file inside is transcribed
    with the Whisper ``"base"`` model. Files that raise during transcription
    are reported and left out.

    Args:
        folder_path: Directory holding one sub-directory per label.
        output_csv: Destination path of the CSV file, with columns
            ``audio_path``, ``transcription`` and ``emotion_label``.
    """
    model = whisper.load_model("base")
    rows = []

    label_names = sorted(os.listdir(folder_path))
    for emotion_label in tqdm(label_names):
        emotion_path = os.path.join(folder_path, emotion_label)
        if os.path.isdir(emotion_path):
            wav_names = [
                name for name in os.listdir(emotion_path)
                if name.lower().endswith(".wav")
            ]
            for wav_name in wav_names:
                audio_path = os.path.join(emotion_path, wav_name)
                try:
                    text = model.transcribe(audio_path)["text"].strip()
                    rows.append([audio_path, text, emotion_label])
                except Exception as exc:
                    print(f"Error processing {audio_path}: {exc}")

    column_names = ["audio_path", "transcription", "emotion_label"]
    pd.DataFrame(rows, columns=column_names).to_csv(output_csv, index=False)
    print(f"Saved CSV to: {output_csv}")


In [None]:
# Transcribe the labelled speech folder on Drive into a CSV in the working directory.
speech_dataset_to_csv(
    folder_path="/content/drive/MyDrive/new_speech_data",
    output_csv="speech_emotion_dataset.csv",
)