<a href="https://colab.research.google.com/github/roberthouston14/Feature_Extractors/blob/main/Traditional_Feature_Extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import string
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stopword corpus is available before it is loaded below.
nltk.download('stopwords')
# Module-level set of English stopwords; main() reads it for the stopword_count feature.
stop_words = set(stopwords.words('english'))

def main(csv_file_path):
    """Compute text features for a sample of a CSV file and save them to disk.

    Reads ``csv_file_path``, samples up to 25 rows with a fixed seed, and adds:

    * simple counts derived from the ``text`` column (characters, words,
      average word length, stopwords, punctuation, upper-case words, digits);
    * TF-IDF weights for up to 1000 terms, in columns prefixed ``tfidf_``;
    * raw bigram counts for up to 1000 bigrams, in columns prefixed ``bigram_``.

    The extended table is written next to the input file as
    ``<input path without extension>_features.csv``.

    Args:
        csv_file_path: Path to a CSV file containing a ``text`` column.

    Returns:
        None. The only result is the CSV file written to disk.
    """
    # Read the CSV file
    data = pd.read_csv(csv_file_path)

    # Randomly sample 25 rows (or every row when the file has fewer than 25).
    # The sampled rows keep their original, scattered index labels, so the
    # index is reset to 0..n-1. The vectorizer DataFrames below are created
    # with that same default index, and pd.concat(axis=1) aligns on index
    # labels: without the reset, features end up on the wrong rows and the
    # output gains extra NaN-padded rows.
    data = data.sample(n=min(25, len(data)), random_state=42).reset_index(drop=True)

    # Compute text-based features
    data['char_count'] = data['text'].apply(len)
    data['word_count'] = data['text'].apply(lambda x: len(x.split()))
    data['avg_word_length'] = data['char_count'] / data['word_count']
    data['stopword_count'] = data['text'].apply(lambda x: len([w for w in x.lower().split() if w in stop_words]))
    data['punctuation_count'] = data['text'].apply(lambda x: len([c for c in x if c in string.punctuation]))
    data['uppercase_count'] = data['text'].apply(lambda x: len([w for w in x.split() if w.isupper()]))
    data['digit_count'] = data['text'].apply(lambda x: len([c for c in x if c.isdigit()]))

    # Compute TF-IDF vectors. Columns are prefixed so that a vocabulary term
    # such as "text" cannot duplicate the name of an existing column.
    tfidf_vectorizer = TfidfVectorizer(max_features=1000)
    tfidf_matrix = tfidf_vectorizer.fit_transform(data['text'])
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=['tfidf_' + name for name in tfidf_feature_names],
    )

    # Compute n-gram features (bigrams as an example), prefixed for the same reason.
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=1000)
    bigram_matrix = bigram_vectorizer.fit_transform(data['text'])
    bigram_feature_names = bigram_vectorizer.get_feature_names_out()
    bigram_df = pd.DataFrame(
        bigram_matrix.toarray(),
        columns=['bigram_' + name for name in bigram_feature_names],
    )

    # Concatenate the original dataset with TF-IDF and n-gram feature DataFrames.
    # All three frames now share the index 0..n-1, so rows line up one-to-one.
    data_extended = pd.concat([data, tfidf_df, bigram_df], axis=1)

    # Save the updated DataFrame to a new CSV file
    output_file_path = os.path.splitext(csv_file_path)[0] + '_features.csv'
    data_extended.to_csv(output_file_path, index=False)

# Entry point: prompt for the CSV path, then run the feature extraction on it.
if __name__ == "__main__":
    csv_file_path = input("Please enter the path to the CSV file: ")
    main(csv_file_path)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Please enter the path to the CSV file: /content/drive/MyDrive/Production Datasets/Working_Gold_Data_POS_Features_Spellcheck_Features.csv


In [5]:
import os
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords

def main(csv_file_path, sample_size=25, random_state=42):
    """Compute text features for a sample of a CSV file and save them to disk.

    Reads ``csv_file_path``, draws a random sample of rows, and adds:

    * simple counts derived from the ``text`` column (characters, words,
      average word length, stopwords, punctuation, upper-case words, digits);
    * TF-IDF weights for up to 1000 terms, in columns prefixed ``tfidf_``;
    * raw bigram counts for up to 1000 bigrams, in columns prefixed ``bigram_``.

    The extended table is written next to the input file as
    ``<input path without extension>_features.csv``.

    Args:
        csv_file_path: Path to a CSV file containing a ``text`` column.
        sample_size: Number of rows to sample. It is clamped to the number of
            rows in the file; pass ``None`` to keep every row. Defaults to 25.
        random_state: Seed passed to ``DataFrame.sample``. Defaults to 42.

    Returns:
        None. The only result is the CSV file written to disk.
    """
    # Read the CSV file
    data = pd.read_csv(csv_file_path)

    # Randomly sample rows. Clamping avoids the error DataFrame.sample raises
    # when asked for more rows than exist.
    if sample_size is not None:
        data = data.sample(n=min(sample_size, len(data)), random_state=random_state)

    # Reset the index so it matches the 0..n-1 index of the feature DataFrames
    # built below; pd.concat(axis=1) aligns rows on index labels.
    data = data.reset_index(drop=True)

    # Work on a string-only copy of the text column: missing values would
    # otherwise break len()/split() and the vectorizers. The stored ``text``
    # column itself is left untouched.
    text = data['text'].fillna('').astype(str)

    # Compute text-based features
    stop_words = set(stopwords.words("english"))

    data['char_count'] = text.str.len()
    data['word_count'] = text.str.split().str.len()
    # Rows with no words get 0.0 rather than inf/NaN from dividing by zero.
    data['avg_word_length'] = (
        data['char_count'] / data['word_count'].where(data['word_count'] > 0)
    ).fillna(0.0)
    data['stopword_count'] = text.apply(lambda x: sum(1 for w in x.lower().split() if w in stop_words))
    data['punctuation_count'] = text.apply(lambda x: sum(1 for c in x if c in string.punctuation))
    data['uppercase_count'] = text.apply(lambda x: sum(1 for w in x.split() if w.isupper()))
    data['digit_count'] = text.apply(lambda x: sum(1 for c in x if c.isdigit()))

    # Compute TF-IDF features
    tfidf_vectorizer = TfidfVectorizer(max_features=1000)
    tfidf_matrix = tfidf_vectorizer.fit_transform(text)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=['tfidf_' + col for col in tfidf_feature_names],
    )

    # Compute bigram features
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=1000)
    bigram_matrix = bigram_vectorizer.fit_transform(text)
    bigram_feature_names = bigram_vectorizer.get_feature_names_out()
    bigram_df = pd.DataFrame(
        bigram_matrix.toarray(),
        columns=['bigram_' + col for col in bigram_feature_names],
    )

    # Concatenate the input dataset and the new feature DataFrames
    data = pd.concat([data, tfidf_df, bigram_df], axis=1)

    # Save the updated DataFrame to a new CSV file
    output_file_path = os.path.splitext(csv_file_path)[0] + '_features.csv'
    data.to_csv(output_file_path, index=False)

# Entry point: prompt for the CSV path, then run the feature extraction on it.
if __name__ == "__main__":
    csv_file_path = input("Please enter the path to the CSV file: ")
    main(csv_file_path)


Please enter the path to the CSV file: /content/drive/MyDrive/Production Datasets/Working_Gold_Data_POS_Features_Spellcheck_Features.csv
