In [87]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib



In [108]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

def spacy_summarizer(text, num_tokens=3):
    # Load the spaCy English model
    nlp = spacy.load("en_core_web_sm")

    # Process the text using spaCy
    doc = nlp(text)

    # Extract noun phrases and remove stopwords
    noun_phrases = [
        chunk.text.replace(" ", "_") 
        for chunk in doc.noun_chunks 
        if chunk.text.lower() not in STOP_WORDS
    ]

    # If there are noun phrases, use them as summary
    if noun_phrases:
        summary = "_".join(noun_phrases[:num_tokens])
    else:
        # If no noun phrases, use the first few tokens
        summary = "_".join([token.text for token in doc][:num_tokens])

    return summary.lower()



In [109]:
# Load the dataset
df = pd.read_csv('../datasets/fitness_survey.csv')
print(df.head(2))

                         Timestamp Your name  Your gender  Your age   \
0  2019/07/03 11:48:07 PM GMT+5:30    Parkavi       Female  19 to 25   
1  2019/07/03 11:51:22 PM GMT+5:30   Nithilaa       Female  19 to 25   

   How important is exercise to you ?  \
0                                   2   
1                                   4   

  How do you describe your current level of fitness ?  \
0                                               Good    
1                                          Very good    

  How often do you exercise?  \
0                      Never   
1                      Never   

  What barriers, if any, prevent you from exercising more regularly?           (Please select all that apply)  \
0    I don't have enough time;I can't stay motivated                                                            
1     I don't have enough time;I'll become too tired                                                            

  What form(s) of exercise do you currently partic

In [110]:

def generate_column_mapping(columns):
    """
    Generate a mapping of old column names to summarized new column names.

    Parameters:
    - columns (list): List of old column names.

    Returns:
    - dict: Mapping of old column names to summarized new column names.
    """
    column_mapping = {column: spacy_summarizer(column) for column in columns}
    return column_mapping



In [112]:
# Extract the column names
df_columns = df.columns.tolist()

# Generate column mapping with summarized names
column_mapping_result = generate_column_mapping(df_columns)

print(column_mapping_result)

# # Replace the DataFrame columns with the summarized versions
df.rename(columns=column_mapping_result, inplace=True)

# # Print the DataFrame with updated column names
print(df.head(2))

{'Timestamp': 'timestamp', 'Your name ': 'your_name', 'Your gender ': 'your_gender', 'Your age ': 'your_age', 'How important is exercise to you ?': 'exercise', 'How do you describe your current level of fitness ?': 'your_current_level_fitness', 'How often do you exercise?': 'how_often_do', 'What barriers, if any, prevent you from exercising more regularly?           (Please select all that apply)': 'what_barriers', 'What form(s) of exercise do you currently participate in ?                        (Please select all that apply)': 'what_form(s_exercise', 'Do you exercise ___________ ?': 'do_you_exercise', 'What time if the day do you prefer to exercise?': 'what_time_if', 'How long do you spend exercising per day ?': 'day', 'Would you say you eat a healthy balanced diet ?': 'a_healthy_balanced_diet', 'What prevents you from eating a healthy balanced diet, If any?                         (Please select all that apply)': 'a_healthy_balanced_diet', 'How healthy do you consider yourself?': 'h

In [113]:
# List of columns to remove
columns_to_remove = list(set(df.columns) - set(column_mapping_result.values()))

# Remove unnecessary columns from the DataFrame
df = df.drop(columns=columns_to_remove)

# Display the updated DataFrame
print(df.head(2))


                         timestamp your_name your_gender  your_age  exercise  \
0  2019/07/03 11:48:07 PM GMT+5:30   Parkavi      Female  19 to 25         2   
1  2019/07/03 11:51:22 PM GMT+5:30  Nithilaa      Female  19 to 25         4   

  your_current_level_fitness how_often_do  \
0                       Good        Never   
1                  Very good        Never   

                                     what_barriers  \
0  I don't have enough time;I can't stay motivated   
1   I don't have enough time;I'll become too tired   

          what_form(s_exercise          do_you_exercise   what_time_if  \
0      I don't really exercise  I don't really exercise  Early morning   
1  Walking or jogging;Swimming             With a group  Early morning   

                       day a_healthy_balanced_diet  \
0  I don't really exercise              Not always   
1  I don't really exercise              Not always   

                             a_healthy_balanced_diet  how_healthy_do  \
0 