In [31]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [32]:
import xml.etree.ElementTree as ET
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Parse XML file
tree = ET.parse('/content/gdrive/MyDrive/Colab Notebooks/Posts.xml')
root = tree.getroot()

# Create a dictionary to store question data
questions = {
    'id': [],
    'title': [],
    'body': [],
    'score': [],
    'answer_count': []
}

# Extract relevant data from XML
for child in root:
    if child.attrib['PostTypeId'] == '1':  # Check if post is a question
        questions['id'].append(child.attrib['Id'])
        questions['title'].append(child.attrib['Title'])
        questions['body'].append(child.attrib['Body'])
        questions['score'].append(int(child.attrib['Score']))
        questions['answer_count'].append(int(child.attrib['AnswerCount']))

# Label the questions
labels = []
for i in range(len(questions['id'])):
    if questions['score'][i] > 5 and questions['answer_count'][i] > 0:
        labels.append('Good-Quality')
    elif 0 <= questions['score'][i] <= 5 and questions['answer_count'][i] == 0:
        labels.append('Low-Quality')
    else:
        labels.append('Very-Low-Quality')

# Create a dataframe with features and labels
df = pd.DataFrame({
    'id': questions['id'],
    'title': questions['title'],
    'body': questions['body'],
    'score': questions['score'],
    'answer_count': questions['answer_count'],
    'label': labels
})

# Extract features using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['title'] + ' ' + df['body'])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, random_state=42)

# Train logistic regression model
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Make predictions on test set
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


Accuracy: 0.738843630596208


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Function to parse the XML file and extract required data
def parse_XML(xml_file):
    tree = ET.parse('/content/gdrive/MyDrive/Colab Notebooks/Posts.xml')
    root = tree.getroot()
    data = []
    for post in root.findall('row'):
        try:
            post_id = int(post.get('Id'))
            view_count = int(post.get('ViewCount'))
            body_length = len(post.get('Body'))
            title_length = len(post.get('Title'))
            score = int(post.get('Score'))
            answer_count = int(post.get('AnswerCount'))
            comment_count = int(post.get('CommentCount'))
            if score > 5 and answer_count > 0:
                quality = 'Good'
            elif score >= 0 and answer_count == 0:
                quality = 'Low'
            else:
                quality = 'Very Low'
            data.append([post_id, view_count, body_length, title_length, score, answer_count, comment_count, quality])
        except:
            pass
    return pd.DataFrame(data, columns=['ID', 'ViewCount', 'BodyLength', 'TitleLength', 'Score', 'AnswerCount', 'CommentCount', 'Quality'])

# Function to create additional features
def create_features(df):
    df['TitleLength2'] = df['TitleLength']**2
    df['BodyLength2'] = df['BodyLength']**2
    df['TitleLengthLog'] = np.log(df['TitleLength'])
    df['BodyLengthLog'] = np.log(df['BodyLength'])
    df['ViewCountLog'] = np.log(df['ViewCount'] + 1)
    df['AnswerCountLog'] = np.log(df['AnswerCount'] + 1)
    df['CommentCountLog'] = np.log(df['CommentCount'] + 1)
    return df

# Parse the XML file and create the dataset
df = parse_XML('Posts.xml')

# Create additional features
df = create_features(df)

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(['ID', 'Quality'], axis=1), df['Quality'], test_size=0.2, random_state=42)

# Train a logistic regression model
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model on test set
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[  522     1  2611]
 [    3     6  4458]
 [  523    10 19608]]
Accuracy: 0.7258308701607671


In [34]:
import xml.etree.ElementTree as ET
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import re
# function to extract features from the XML file
def extract_features(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    features = []
    for child in root:
        score = int(child.get('Score'))
        answer_count = (child.get('AnswerCount'))
        title = child.get('Title')
        body = child.get('Body')
        tags = child.get('Tags')
        view_count = (child.get('ViewCount'))
        comment_count = int(child.get('CommentCount'))
        if answer_count is not None:
          if score > 5 and int(answer_count) > 0:
              quality = 'good'
          elif score >= 0 and int(answer_count) == 0:
              quality = 'low'
          else:
              quality = 'very-low'
        features.append({'title': title, 'body': body, 'tags': tags, 'view_count': view_count,
                         'comment_count': comment_count, 'quality': quality})
    return features

# function to create a dataframe from the extracted features
def create_dataframe(features):
    df = pd.DataFrame(features, columns=['title', 'body', 'tags', 'view_count', 'comment_count', 'quality'])
    df = df.dropna()
    df['text'] = df['title'] + ' ' + df['body'] + ' ' + df['tags']
    return df

# function to preprocess the text data
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

# extract features from the XML file
features = extract_features('/content/gdrive/MyDrive/Colab Notebooks/Posts.xml')

# create a dataframe from the extracted features
df = create_dataframe(features)

# preprocess the text data
df['text'] = df['text'].apply(preprocess_text)

# create the feature matrix
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])

# create the label vector
y = df['quality']

# train the Multinomial-Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X, y)

# predict the labels for the test data
y_pred = clf.predict(X)

# calculate the accuracy of the classifier
accuracy = accuracy_score(y, y_pred)

print('Accuracy:', accuracy)


Accuracy: 0.7970441929204816


In [35]:
import xml.etree.ElementTree as ET
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Parse the XML file and create a DataFrame of features
tree = ET.parse('/content/gdrive/MyDrive/Colab Notebooks/Posts.xml')
root = tree.getroot()
features = []
for child in root:
    if child.attrib['PostTypeId'] == '1':  # Only consider questions
        id = child.attrib['Id']
        viewcount = int(child.attrib['ViewCount']) if 'ViewCount' in child.attrib else 0
        score = int(child.attrib['Score']) if 'Score' in child.attrib else 0
        answers = int(child.attrib['AnswerCount']) if 'AnswerCount' in child.attrib else 0
        title = child.attrib['Title'] if 'Title' in child.attrib else ''
        body = child.attrib['Body'] if 'Body' in child.attrib else ''
        comments = int(child.attrib['CommentCount']) if 'CommentCount' in child.attrib else 0
        features.append({'ID': id, 'ViewCount': viewcount, 'Score': score, 'Answers': answers,
                         'Title': len(title), 'Body': len(body), 'Comments': comments})

df = pd.DataFrame(features)

# Label the data into three categories
def label_quality(row):
    if row['Score'] > 5 and row['Answers'] > 0:
        return 'Good'
    elif 0 <= row['Score'] <= 5 and row['Answers'] == 0:
        return 'Low'
    elif row['Score'] < 0:
        return 'Very Low'
    else:
        return 'Unknown'
df['Quality'] = df.apply(label_quality, axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[['ViewCount', 'Answers', 'Title', 'Body', 'Comments']], df['Quality'], test_size=0.2, random_state=42)

# Train a random forest classifier and make predictions
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.84
