In [None]:
##BASE##
import os
import xml.etree.ElementTree as ET
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.metrics import confusion_matrix, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Download the NLTK resources this cell relies on: the stop-word list and
# WordNet are used right below; the tagger model is presumably what
# nltk.pos_tag (see derive_pos_tags) needs -- TODO confirm.
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

# Shared, module-level helpers read by preprocess_text.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Module-level counter; declared `global` in extract_text_by_subject_id and
# printed at the end of this cell.
classified_count = 0
def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmas.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is deleted, the remainder is split on whitespace, tokens found
    in ``stop_words`` are dropped and the survivors are passed through
    ``lemmatizer.lemmatize``.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    lemmas = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

def extract_text_by_subject_id(folder_path):
    """Build one row per subject from the ``.xml`` files in *folder_path*.

    Every file whose root element is ``INDIVIDUAL`` contributes the text of
    all its ``WRITING/TEXT`` elements to the subject named by the root's
    ``ID`` child (or to ``"Missing Subject ID"`` when that element is absent
    or empty). The writings of a subject are joined, cleaned once with
    ``preprocess_text`` and counted.

    Files that cannot be parsed or processed are reported on stdout and
    skipped, so one bad file does not abort the whole run.

    Args:
        folder_path: Directory scanned (non-recursively) for ``.xml`` files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``ID``, ``TEXT`` (cleaned
        text) and ``Term-Frequency `` (``term:count`` pairs, most frequent
        first). The columns exist even when no subject was found.

    Side effects:
        Adds the number of collected writings to the module-level
        ``classified_count``.
    """
    global classified_count
    subject_dict = {}

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order -- and therefore any seeded train/test split -- reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.xml'):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            root = ET.parse(file_path).getroot()
            if root.tag != 'INDIVIDUAL':
                continue

            subject_elem = root.find('./ID')
            if subject_elem is not None and subject_elem.text:
                subject_id = subject_elem.text.strip()
            else:
                subject_id = "Missing Subject ID"

            for writing in root.findall('.//WRITING'):
                text_elem = writing.find('TEXT')
                if text_elem is None or not text_elem.text:
                    continue
                # Store the raw text: cleaning happens exactly once per
                # subject below (running preprocess_text twice lemmatises
                # already-lemmatised tokens again).
                subject_dict.setdefault(subject_id, []).append(text_elem.text.strip())
                classified_count += 1
        except ET.ParseError as e:
            print(f"Error parsing {filename}: {e}")
        except Exception as e:
            # Deliberate best effort: report the file and keep going.
            print(f"An error occurred with {filename}: {e}")

    data = []
    for subject_id, texts in subject_dict.items():
        combined_texts = preprocess_text(' | '.join(texts))
        term_freq = Counter(combined_texts.split())
        term_freq_str = ", ".join(
            f"{term}:{freq}" for term, freq in term_freq.most_common()
        )
        data.append({'ID': subject_id, 'TEXT': combined_texts, 'Term-Frequency ': term_freq_str})

    # Naming the columns keeps them present when `data` is empty.
    return pd.DataFrame(data, columns=['ID', 'TEXT', 'Term-Frequency '])
def plot_sentiment_distribution(df):
    """Show a bar chart counting the rows of *df* per predicted label.

    *df* must contain a ``Predicted Sentiment`` column.
    """
    axes = sns.countplot(data=df, x='Predicted Sentiment')
    axes.set_title("Sentiment Distribution")
    axes.set_xlabel("Sentiment")
    axes.set_ylabel("Count")
    plt.show()
def plot_wordcloud(df, sentiment_value, sentiment_label):
    """Draw a word cloud of the texts predicted as *sentiment_value*.

    Rows of *df* whose ``Predicted Sentiment`` equals *sentiment_value* are
    selected, their non-null ``TEXT`` values are joined with spaces and
    rendered. When nothing but whitespace remains, a message is printed and no
    figure is created. *sentiment_label* is only used in the message and in
    the figure title.
    """
    matching_rows = df[df['Predicted Sentiment'] == sentiment_value]
    sentiment_texts = ' '.join(matching_rows['TEXT'].dropna())
    if sentiment_texts.strip() == '':
        print(f"No text entries found for {sentiment_label} sentiment.")
        return

    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    cloud_image = cloud_builder.generate(sentiment_texts)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Word Cloud for {sentiment_label.capitalize()} Sentiments")
    plt.show()

def load_ground_truth(ground_truth_file):
    """Read the subject -> label mapping from a whitespace-separated file.

    Every line consisting of exactly two whitespace-separated fields is read
    as ``<subject_id> <integer label>``. Lines with any other number of
    fields (blank lines included) are ignored.

    Args:
        ground_truth_file: Path of the label file.

    Returns:
        dict mapping subject id (str) to label (int).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the second field of a two-field line is not an integer.
    """
    ground_truth = {}
    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise become part of the first subject id. Being
    # explicit also removes the dependency on the platform default encoding.
    with open(ground_truth_file, 'r', encoding='utf-8-sig') as file:
        for line in file:
            parts = line.split()
            if len(parts) == 2:
                subject_id, label = parts
                ground_truth[subject_id] = int(label)
    return ground_truth

def evaluate_model(y_true, y_pred):
    """Print the confusion matrix and headline metrics of a prediction.

    Accuracy, precision, recall and F1 are computed with the scorers' default
    arguments and printed with two decimals; the confusion matrix is printed
    as a nested list.
    """
    accuracy_value, precision_value, recall_value, f1_value = (
        scorer(y_true, y_pred)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )
    matrix_rows = confusion_matrix(y_true, y_pred).tolist()

    print(f"Confusion Matrix: {matrix_rows}")
    print(f"Accuracy: {accuracy_value:.2f}")
    print(f"Precision: {precision_value:.2f}")
    print(f"Recall: {recall_value:.2f}")
    print(f"F1 Score: {f1_value:.2f}")

def derive_pos_tags(df, output_file='pathtofile'):
    """Tag every token of the ``TEXT`` column and write the tags to a CSV.

    Each row's text is split on whitespace and tagged with ``nltk.pos_tag``;
    the ``(word, tag)`` pairs of all rows are written, in row order, as a
    two-column CSV (``Word``, ``POS Tag``).

    Args:
        df: DataFrame with a ``TEXT`` column of strings.
        output_file: Destination CSV path. Defaults to the path that used to
            be hard-coded, so existing calls behave as before.
    """
    pos_data = []
    # Iterating the column directly avoids building a Series per row, which
    # is what DataFrame.iterrows() does.
    for text in df['TEXT']:
        pos_data.extend(nltk.pos_tag(text.split()))
    pos_df = pd.DataFrame(pos_data, columns=['Word', 'POS Tag'])
    pos_df.to_csv(output_file, index=False)
    # Report the path that was actually written rather than a fixed file name.
    print(f"POS tags saved to {output_file}")

def save_predictions_to_csv(test_df, predictions, output_file):
    """Attach *predictions* to *test_df* and export the result as CSV.

    The predictions are stored in-place in a ``Predicted Sentiment`` column of
    *test_df* (callers can read that column afterwards); the columns ``ID``,
    ``TEXT``, ``Predicted Sentiment`` and ``Term-Frequency `` are then written
    to *output_file* without the index.
    """
    export_columns = ['ID', 'TEXT', 'Predicted Sentiment', 'Term-Frequency ']
    test_df['Predicted Sentiment'] = predictions
    report = test_df[export_columns]
    report.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

# --- Configuration ---------------------------------------------------------
folder_path = 'pathtofolder'
ground_truth_file = 'pathtofolder'


# --- Load the corpus and its labels ----------------------------------------
df = extract_text_by_subject_id(folder_path)

ground_truth = load_ground_truth(ground_truth_file)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# save_predictions_to_csv adds a column to test_df; working on an explicit
# copy avoids pandas' chained-assignment warning on a frame derived from df.
test_df = test_df.copy()

train_texts = train_df['TEXT'].tolist()
train_labels = [ground_truth[id_] for id_ in train_df['ID']]
test_texts = test_df['TEXT'].tolist()
test_labels = [ground_truth[id_] for id_ in test_df['ID']]


# Classifier setup
vectorizer = TfidfVectorizer(max_features=1000)
svm_model = make_pipeline(StandardScaler(with_mean=False), SVC(kernel='linear'))

train_vectors = vectorizer.fit_transform(train_texts)
test_vectors = vectorizer.transform(test_texts)

svm_model.fit(train_vectors, train_labels)

test_predictions = svm_model.predict(test_vectors)

print("\nEvaluation on Testing Set:")
evaluate_model(test_labels, test_predictions)

# Save predictions
output_file = 'pathtofile'
save_predictions_to_csv(test_df, test_predictions, output_file)

# plot_sentiment_distribution reads the 'Predicted Sentiment' column, which df
# does not have until predictions are attached for every subject.
df['Predicted Sentiment'] = svm_model.predict(vectorizer.transform(df['TEXT']))

plot_wordcloud(test_df, sentiment_value=1, sentiment_label='positive')
plot_wordcloud(test_df, sentiment_value=0, sentiment_label='negative')
plot_sentiment_distribution(df)

print(classified_count)

In [None]:
#HAIR AND SKIN##
import os
import re
import xml.etree.ElementTree as ET
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from wordcloud import WordCloud

# Download the NLTK resources this cell relies on: the stop-word list and
# WordNet are used right below; the tagger model is presumably what
# nltk.pos_tag (see derive_pos_tags) needs -- TODO confirm.
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

# Shared, module-level helpers read by preprocess_text.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Lower-case phrases searched for, as substrings, in each lower-cased writing.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated, which used to fuse "dark spots" and "picking skin"
# into a single phrase. Entries carry no surrounding whitespace so that a
# phrase also matches at the end of a text or before punctuation.
keywords = [
    "skin", "ugly", "hair", "hate my face", "pimples", "dark circles",
    "hyperpigmentation", "acne scars", "pick my skin", "wrinkles", "aging",
    "my confidence", "balding", "hair loss", "hair pulling", "acne",
    "hair transplants", "going bald", "perfect hair", "dandruff", "dark spots",
    "picking skin", "gone bald",
]
# Number of writings that matched a keyword; updated by
# extract_text_by_subject_id.
classified_count = 0
def preprocess_text(text):
    """Return *text* reduced to lower-case, stop-word-free lemmas.

    Steps: lower-case the input, delete everything that is not ``a``-``z`` or
    whitespace, split on whitespace, skip tokens contained in ``stop_words``
    and lemmatise the rest with ``lemmatizer``. The lemmas are joined with
    single spaces.
    """
    cleaned = re.sub(r'[^a-z\s]', '', text.lower())
    kept_tokens = filter(lambda token: token not in stop_words, cleaned.split())
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)

def extract_text_by_subject_id(folder_path):
    """Collect the keyword-matching writings of every subject in *folder_path*.

    Every ``.xml`` file whose root element is ``INDIVIDUAL`` is scanned. A
    ``WRITING/TEXT`` whose lower-cased text contains at least one entry of the
    module-level ``keywords`` is stored under the subject named by the root's
    ``ID`` child (``"Missing Subject ID"`` when absent or empty). The stored
    writings of a subject are joined, cleaned with ``preprocess_text`` and
    counted.

    Files that cannot be parsed or processed are reported on stdout and
    skipped, so one bad file does not abort the whole run.

    Args:
        folder_path: Directory scanned (non-recursively) for ``.xml`` files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``ID``, ``TEXT`` (cleaned
        text) and ``Term-Frequency `` (``term:count`` pairs, most frequent
        first). The columns exist even when nothing matched.

    Side effects:
        Increments the module-level ``classified_count`` once per matching
        writing.
    """
    global classified_count
    subject_dict = {}

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order -- and therefore any seeded train/test split -- reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.xml'):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            root = ET.parse(file_path).getroot()
            if root.tag != 'INDIVIDUAL':
                continue

            subject_elem = root.find('./ID')
            if subject_elem is not None and subject_elem.text:
                subject_id = subject_elem.text.strip()
            else:
                subject_id = "Missing Subject ID"

            for writing in root.findall('.//WRITING'):
                text_elem = writing.find('TEXT')
                if text_elem is None or not text_elem.text:
                    continue
                text = text_elem.text.strip()
                # Lower-case once per writing instead of once per keyword.
                lowered = text.lower()
                if any(keyword in lowered for keyword in keywords):
                    subject_dict.setdefault(subject_id, []).append(text)
                    classified_count += 1
        except ET.ParseError as e:
            print(f"Error parsing {filename}: {e}")
        except Exception as e:
            # Deliberate best effort: report the file and keep going.
            print(f"An error occurred with {filename}: {e}")

    data = []
    for subject_id, texts in subject_dict.items():
        combined_texts = preprocess_text(' | '.join(texts))
        term_freq = Counter(combined_texts.split())
        term_freq_str = ", ".join(
            f"{term}:{freq}" for term, freq in term_freq.most_common()
        )
        data.append({'ID': subject_id, 'TEXT': combined_texts, 'Term-Frequency ': term_freq_str})

    # Naming the columns keeps them present when `data` is empty.
    return pd.DataFrame(data, columns=['ID', 'TEXT', 'Term-Frequency '])

def plot_sentiment_distribution(df):
    """Display a count plot of the ``Predicted Sentiment`` column of *df*."""
    chart = sns.countplot(data=df, x='Predicted Sentiment')
    chart.set_title("Sentiment Distribution")
    chart.set_xlabel("Sentiment")
    chart.set_ylabel("Count")
    plt.show()
def plot_wordcloud(df, sentiment_value, sentiment_label):
    """Render a word cloud for the rows predicted as *sentiment_value*.

    The non-null ``TEXT`` values of the rows whose ``Predicted Sentiment``
    equals *sentiment_value* are joined with spaces. If that leaves only
    whitespace, a notice is printed and nothing is drawn. *sentiment_label*
    appears in the notice and in the figure title.
    """
    selected = df[df['Predicted Sentiment'] == sentiment_value]
    sentiment_texts = ' '.join(selected['TEXT'].dropna())
    if sentiment_texts.strip() == '':
        print(f"No text entries found for {sentiment_label} sentiment.")
        return

    generator = WordCloud(width=800, height=400, background_color='white')
    rendered = generator.generate(sentiment_texts)
    plt.figure(figsize=(10, 5))
    plt.imshow(rendered, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Word Cloud for {sentiment_label.capitalize()} Sentiments")
    plt.show()

def load_ground_truth(ground_truth_file):
    """Read the subject -> label mapping from a whitespace-separated file.

    Every line consisting of exactly two whitespace-separated fields is read
    as ``<subject_id> <integer label>``. Lines with any other number of
    fields (blank lines included) are ignored.

    Args:
        ground_truth_file: Path of the label file.

    Returns:
        dict mapping subject id (str) to label (int).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the second field of a two-field line is not an integer.
    """
    ground_truth = {}
    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise become part of the first subject id. Being
    # explicit also removes the dependency on the platform default encoding.
    with open(ground_truth_file, 'r', encoding='utf-8-sig') as file:
        for line in file:
            parts = line.split()
            if len(parts) == 2:
                subject_id, label = parts
                ground_truth[subject_id] = int(label)
    return ground_truth


def evaluate_model(y_true, y_pred):
    """Print the confusion matrix followed by the classification report."""
    print("Confusion Matrix:")
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)

    print("\nClassification Report:")
    summary = classification_report(y_true, y_pred)
    print(summary)

def derive_pos_tags(df, output_file='pathtofile'):
    """Tag every token of the ``TEXT`` column and write the tags to a CSV.

    Each row's text is split on whitespace and tagged with ``nltk.pos_tag``;
    the ``(word, tag)`` pairs of all rows are written, in row order, as a
    two-column CSV (``Word``, ``POS Tag``).

    Args:
        df: DataFrame with a ``TEXT`` column of strings.
        output_file: Destination CSV path. Defaults to the path that used to
            be hard-coded, so existing calls behave as before.
    """
    pos_data = []
    # Iterating the column directly avoids building a Series per row, which
    # is what DataFrame.iterrows() does.
    for text in df['TEXT']:
        pos_data.extend(nltk.pos_tag(text.split()))
    pos_df = pd.DataFrame(pos_data, columns=['Word', 'POS Tag'])
    pos_df.to_csv(output_file, index=False)
    # Report the path that was actually written rather than a fixed file name.
    print(f"POS tags saved to {output_file}")

def save_predictions_to_csv(test_df, predictions, output_file):
    """Store *predictions* on *test_df* and write the labelled rows to CSV.

    ``Predicted Sentiment`` is assigned in-place on *test_df*; the columns
    ``ID``, ``TEXT``, ``Predicted Sentiment`` and ``Term-Frequency `` are then
    saved to *output_file* without the index.
    """
    selected_columns = ['ID', 'TEXT', 'Predicted Sentiment', 'Term-Frequency ']
    test_df['Predicted Sentiment'] = predictions
    labelled = test_df[selected_columns]
    labelled.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

# Input locations (placeholders): the folder with the subjects' XML files and
# the file holding the "<subject_id> <label>" ground truth.
folder_path = 'pathtofile'
ground_truth_file = 'pathtofile'

# One row per subject that has at least one keyword-matching writing.
df = extract_text_by_subject_id(folder_path)

ground_truth = load_ground_truth(ground_truth_file)

# 80/20 split with a fixed seed.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Labels are looked up by subject id; an id that is missing from the ground
# truth raises KeyError here.
train_texts = train_df['TEXT'].tolist()
train_labels = [ground_truth[id_] for id_ in train_df['ID']]
test_texts = test_df['TEXT'].tolist()
test_labels = [ground_truth[id_] for id_ in test_df['ID']]


def save_all_sentiments_to_csv(df, model, vectorizer, output_file):
    """Predict a label for every row of *df* and write the result to CSV.

    The ``TEXT`` column is passed through ``vectorizer.transform`` and the
    result through ``model.predict``. The predictions are attached to a copy
    of *df* (the argument itself is left unchanged) and the columns ``ID``,
    ``TEXT`` and ``Predicted Sentiment`` are written to *output_file* without
    the index.
    """
    predictions = model.predict(vectorizer.transform(df['TEXT']))
    results_df = df.assign(**{'Predicted Sentiment': predictions})
    export = results_df[['ID', 'TEXT', 'Predicted Sentiment']]
    export.to_csv(output_file, index=False)
    print(f"Sentiment predictions saved to {output_file}")

# Classifier setup: TF-IDF features (at most 1000 terms) fed into a scaler
# followed by a linear-kernel SVM.
# NOTE(review): with_mean=False is presumably required because the TF-IDF
# matrix is sparse -- confirm.
vectorizer = TfidfVectorizer(max_features=1000)
svm_model = make_pipeline(StandardScaler(with_mean=False), SVC(kernel='linear'))

# The vocabulary is fitted on the training texts only and reused for the test
# texts.
train_vectors = vectorizer.fit_transform(train_texts)
test_vectors = vectorizer.transform(test_texts)

svm_model.fit(train_vectors, train_labels)

test_predictions = svm_model.predict(test_vectors)

print("\nEvaluation on Testing Set:")
evaluate_model(test_labels, test_predictions)

# Save predictions (this call also adds 'Predicted Sentiment' to test_df,
# which the word clouds below rely on).
output_file = 'pathtofile'
save_predictions_to_csv(test_df, test_predictions, output_file)

# Label every subject (training rows included) so the distribution plot has a
# 'Predicted Sentiment' column to read.
df['Predicted Sentiment'] = svm_model.predict(vectorizer.transform(df['TEXT']))

plot_wordcloud(test_df, sentiment_value=1, sentiment_label='positive')
plot_wordcloud(test_df, sentiment_value=0, sentiment_label='negative')
plot_sentiment_distribution(df)

print(f"Classified count: {classified_count}")

In [None]:
##WEIGHT##
import os
import xml.etree.ElementTree as ET
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.metrics import confusion_matrix, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Download the NLTK resources this cell relies on: the stop-word list and
# WordNet are used right below; the tagger model is presumably what
# nltk.pos_tag (see derive_pos_tags) needs -- TODO confirm.
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

# Shared, module-level helpers read by preprocess_text.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Lower-case phrases searched for, as substrings, in each lower-cased writing.
keywords = [
    "weight loss",
    "weight",
    "look ugly",
    "abnormal body",
    "overweight",
    "body dysmorphic disorder",
    "hate my body",
    "hated my body",
    "underweight",
    "too skinny",
    "too fat",
    "weight gain",
]
# Running total of keyword-matching writings.
classified_count = 0
def preprocess_text(text):
    """Clean *text* for vectorisation and return it as one string.

    Lower-cases the input, strips every character outside ``a``-``z`` and
    whitespace, splits on whitespace, discards tokens present in
    ``stop_words`` and lemmatises what is left with ``lemmatizer``. The
    result is the lemmas joined by single spaces.
    """
    stripped = re.sub(r'[^a-z\s]', '', text.lower())
    tokens = stripped.split()
    lemmas = map(
        lemmatizer.lemmatize,
        (token for token in tokens if token not in stop_words),
    )
    return ' '.join(lemmas)
# Must stay an int: extract_text_by_subject_id increments it with `+= 1`.
# Rebinding it to a list made that increment raise TypeError, which the
# function's broad `except` reported per file while dropping the rest of the
# file's writings.
classified_count = 0
def extract_text_by_subject_id(folder_path):
    """Collect the keyword-matching writings of every subject in *folder_path*.

    Every ``.xml`` file whose root element is ``INDIVIDUAL`` is scanned. A
    ``WRITING/TEXT`` whose lower-cased text contains at least one entry of the
    module-level ``keywords`` is stored under the subject named by the root's
    ``ID`` child (``"Missing Subject ID"`` when absent or empty). The stored
    writings of a subject are joined, cleaned with ``preprocess_text`` and
    counted.

    Files that cannot be parsed or processed are reported on stdout and
    skipped, so one bad file does not abort the whole run.

    Args:
        folder_path: Directory scanned (non-recursively) for ``.xml`` files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``ID``, ``TEXT`` (cleaned
        text) and ``Term-Frequency`` (``term:count`` pairs, most frequent
        first). The columns exist even when nothing matched.

    Side effects:
        Increments the module-level ``classified_count`` once per matching
        writing; that name is expected to hold a number.
    """
    global classified_count
    subject_dict = {}

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order -- and therefore any seeded train/test split -- reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.xml'):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            root = ET.parse(file_path).getroot()
            if root.tag != 'INDIVIDUAL':
                continue

            subject_elem = root.find('./ID')
            if subject_elem is not None and subject_elem.text:
                subject_id = subject_elem.text.strip()
            else:
                subject_id = "Missing Subject ID"

            for writing in root.findall('.//WRITING'):
                text_elem = writing.find('TEXT')
                if text_elem is None or not text_elem.text:
                    continue
                text = text_elem.text.strip()
                # Lower-case once per writing instead of once per keyword.
                lowered = text.lower()
                if any(keyword in lowered for keyword in keywords):
                    subject_dict.setdefault(subject_id, []).append(text)
                    classified_count += 1
        except ET.ParseError as e:
            print(f"Error parsing {filename}: {e}")
        except Exception as e:
            # Deliberate best effort: report the file and keep going.
            print(f"An error occurred with {filename}: {e}")

    data = []
    for subject_id, texts in subject_dict.items():
        combined_texts = preprocess_text(' | '.join(texts))
        term_freq = Counter(combined_texts.split())
        term_freq_str = ", ".join(
            f"{term}:{freq}" for term, freq in term_freq.most_common()
        )
        data.append({'ID': subject_id, 'TEXT': combined_texts, 'Term-Frequency': term_freq_str})

    # Naming the columns keeps them present when `data` is empty.
    return pd.DataFrame(data, columns=['ID', 'TEXT', 'Term-Frequency'])

def plot_sentiment_distribution(df):
    """Plot how often each value of ``Predicted Sentiment`` occurs in *df*."""
    count_axes = sns.countplot(data=df, x='Predicted Sentiment')
    count_axes.set_title("Sentiment Distribution")
    count_axes.set_xlabel("Sentiment")
    count_axes.set_ylabel("Count")
    plt.show()
def plot_wordcloud(df, sentiment_value, sentiment_label):
    """Show a word cloud built from the texts labelled *sentiment_value*.

    Only rows whose ``Predicted Sentiment`` equals *sentiment_value* are
    used; their non-null ``TEXT`` values are joined with spaces. When the
    joined text is empty or whitespace only, a message is printed instead of
    a figure. *sentiment_label* is used in that message and in the title.
    """
    rows_for_label = df[df['Predicted Sentiment'] == sentiment_value]
    sentiment_texts = ' '.join(rows_for_label['TEXT'].dropna())

    if sentiment_texts.strip() == '':
        print(f"No text entries found for {sentiment_label} sentiment.")
        return

    cloud = WordCloud(width=800, height=400, background_color='white')
    picture = cloud.generate(sentiment_texts)
    plt.figure(figsize=(10, 5))
    plt.imshow(picture, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Word Cloud for {sentiment_label.capitalize()} Sentiments")
    plt.show()

def load_ground_truth(ground_truth_file):
    """Read the subject -> label mapping from a whitespace-separated file.

    Every line consisting of exactly two whitespace-separated fields is read
    as ``<subject_id> <integer label>``. Lines with any other number of
    fields (blank lines included) are ignored.

    Args:
        ground_truth_file: Path of the label file.

    Returns:
        dict mapping subject id (str) to label (int).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the second field of a two-field line is not an integer.
    """
    ground_truth = {}
    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise become part of the first subject id. Being
    # explicit also removes the dependency on the platform default encoding.
    with open(ground_truth_file, 'r', encoding='utf-8-sig') as file:
        for line in file:
            parts = line.split()
            if len(parts) == 2:
                subject_id, label = parts
                ground_truth[subject_id] = int(label)
    return ground_truth


def evaluate_model(y_true, y_pred):
    """Print the confusion matrix and the classification report, in that order."""
    print("Confusion Matrix:")
    confusion = confusion_matrix(y_true, y_pred)
    print(confusion)

    print("\nClassification Report:")
    report_text = classification_report(y_true, y_pred)
    print(report_text)

def derive_pos_tags(df, output_file='pathtofile'):
    """Tag every token of the ``TEXT`` column and write the tags to a CSV.

    Each row's text is split on whitespace and tagged with ``nltk.pos_tag``;
    the ``(word, tag)`` pairs of all rows are written, in row order, as a
    two-column CSV (``Word``, ``POS Tag``).

    Args:
        df: DataFrame with a ``TEXT`` column of strings.
        output_file: Destination CSV path. Defaults to the path that used to
            be hard-coded, so existing calls behave as before.
    """
    pos_data = []
    # Iterating the column directly avoids building a Series per row, which
    # is what DataFrame.iterrows() does.
    for text in df['TEXT']:
        pos_data.extend(nltk.pos_tag(text.split()))
    pos_df = pd.DataFrame(pos_data, columns=['Word', 'POS Tag'])
    pos_df.to_csv(output_file, index=False)
    # Report the path that was actually written rather than a fixed file name.
    print(f"POS tags saved to {output_file}")

def save_predictions_to_csv(test_df, predictions, output_file):
    """Record *predictions* on *test_df* and dump the labelled rows as CSV.

    The predictions are assigned in-place to ``Predicted Sentiment``; the
    columns ``ID``, ``TEXT``, ``Predicted Sentiment`` and ``Term-Frequency``
    are then written to *output_file* without the index.
    """
    wanted_columns = ['ID', 'TEXT', 'Predicted Sentiment', 'Term-Frequency']
    test_df['Predicted Sentiment'] = predictions
    subset = test_df[wanted_columns]
    subset.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

# Input locations (placeholders): the folder with the subjects' XML files and
# the file holding the "<subject_id> <label>" ground truth.
folder_path = 'pathtofile'
ground_truth_file = 'pathtofile'

# NOTE(review): all_xml_files is not read anywhere in the visible code; it
# looks like a leftover from the disabled skipped-files check below, which
# refers to a `processed_files` name that is not defined here.
all_xml_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.xml')]

#skipped_files = set(all_xml_files) - set(processed_files)

# One row per subject that has at least one keyword-matching writing.
df = extract_text_by_subject_id(folder_path)

ground_truth = load_ground_truth(ground_truth_file)

# 80/20 split with a fixed seed.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Labels are looked up by subject id; an id that is missing from the ground
# truth raises KeyError here.
train_texts = train_df['TEXT'].tolist()
train_labels = [ground_truth[id_] for id_ in train_df['ID']]
test_texts = test_df['TEXT'].tolist()
test_labels = [ground_truth[id_] for id_ in test_df['ID']]

def save_all_sentiments_to_csv(df, model, vectorizer, output_file):
    """Label every row of *df* with *model* and save the labels as CSV.

    ``vectorizer.transform`` is applied to the ``TEXT`` column and
    ``model.predict`` to its output. A copy of *df* receives the predictions
    as ``Predicted Sentiment`` (the argument is not modified) and its ``ID``,
    ``TEXT`` and ``Predicted Sentiment`` columns are written to *output_file*
    without the index.
    """
    features = vectorizer.transform(df['TEXT'])
    labelled = df.assign(**{'Predicted Sentiment': model.predict(features)})
    labelled[['ID', 'TEXT', 'Predicted Sentiment']].to_csv(output_file, index=False)
    print(f"Sentiment predictions saved to {output_file}")

# Classifier setup: TF-IDF features (at most 1000 terms) fed into a scaler
# followed by a linear-kernel SVM.
# NOTE(review): with_mean=False is presumably required because the TF-IDF
# matrix is sparse -- confirm.
vectorizer = TfidfVectorizer(max_features=1000)
svm_model = make_pipeline(StandardScaler(with_mean=False), SVC(kernel='linear'))

# The vocabulary is fitted on the training texts only and reused for the test
# texts.
train_vectors = vectorizer.fit_transform(train_texts)
test_vectors = vectorizer.transform(test_texts)

svm_model.fit(train_vectors, train_labels)

test_predictions = svm_model.predict(test_vectors)

print("\nEvaluation on Testing Set:")
evaluate_model(test_labels, test_predictions)

# Save predictions (this call also adds 'Predicted Sentiment' to test_df,
# which the word clouds below rely on).
output_file = 'pathtofile'
save_predictions_to_csv(test_df, test_predictions, output_file)

# Label every subject (training rows included) so the distribution plot has a
# 'Predicted Sentiment' column to read.
df['Predicted Sentiment'] = svm_model.predict(vectorizer.transform(df['TEXT']))

plot_wordcloud(test_df, sentiment_value=1, sentiment_label='positive')
plot_wordcloud(test_df, sentiment_value=0, sentiment_label='negative')
plot_sentiment_distribution(df)

print(f"Classified count: {classified_count}")
