In [20]:
!pip install pymongo==4.8.0 pandas==2.2.2 matplotlib==3.9.0 seaborn==0.13.1 wordcloud==1.9.3


In [13]:
import pymongo
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from wordcloud import WordCloud

In [27]:
# Connection settings -- adjust these for your own machine.
# The database name and the collection names are the ones obtained after question 5.
path = "mongodb://localhost:27017"
client = pymongo.MongoClient(path)
db = client['ubuntu8']

# Collections that the cells below describe and plot, one after another.
random_collections = [
    'posts_boot_cluster11',
    'posts_apt_cluster1',
    'posts_command-line_cluster19',
    'posts_drivers_cluster14',
    'posts_networking_cluster11',
]

In [28]:
def describe_data(collection):
    """Return summary statistics for the posts stored in ``collection``.

    Parameters
    ----------
    collection
        Object whose ``find()`` yields the post documents (e.g. a pymongo
        collection). Each document is expected to provide ``CreationDate``,
        ``ViewCount``, ``Score`` and ``PostTypeId``.

    Returns
    -------
    pandas.DataFrame
        The ``DataFrame.describe()`` table for ``ViewCount``, ``Score``,
        ``PostTypeId`` and the derived ``CreationYear`` column.
    """
    summary_columns = ['ViewCount', 'Score', 'PostTypeId', 'CreationYear']

    # Materialise the cursor once and load every document into a frame.
    frame = pd.DataFrame(list(collection.find()))

    # Parse the creation timestamp and derive the calendar year from it.
    created = pd.to_datetime(frame['CreationDate'])
    frame['CreationDate'] = created
    frame['CreationYear'] = created.dt.year

    return frame[summary_columns].describe()

In [29]:
def _split_tags(value):
    """Normalise one raw ``Tags`` cell into a list of tag strings."""
    if isinstance(value, str):
        return value.split(',')
    if isinstance(value, (list, tuple)):
        return list(value)
    # Missing values (None / NaN) contribute no tags.
    return []


def view_distribution(collection):
    """Plot the distributions of views, score, creation year and top tags.

    Draws a single 1x4 figure: histograms of ``ViewCount`` and ``Score``,
    a count plot of the creation year, and a bar plot of the ten most
    frequent tags.

    Parameters
    ----------
    collection
        Object whose ``find()`` yields the post documents (e.g. a pymongo
        collection). Each document is expected to provide ``CreationDate``,
        ``ViewCount``, ``Score`` and ``Tags``. ``Tags`` may be a
        comma-separated string, a list of tags, or missing.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()`` and then closed.
    """
    df = pd.DataFrame(list(collection.find()))

    # Parse the creation timestamp and derive the calendar year from it.
    df['CreationDate'] = pd.to_datetime(df['CreationDate'])
    df['CreationYear'] = df['CreationDate'].dt.year

    # Normalise every row on its own: deciding from the first row only breaks
    # as soon as a single document has a missing or differently typed Tags value.
    all_tags = [tag for tags in df['Tags'].map(_split_tags) for tag in tags]
    tag_counts = pd.Series(all_tags).value_counts().head(10)

    # One figure with four panels. No extra plt.figure() calls here: they would
    # create empty figures and redirect tight_layout() away from this one.
    fig, axes = plt.subplots(1, 4, figsize=(24, 4))
    for ax, col in zip(axes, ['ViewCount', 'Score', 'CreationYear', 'Tags']):
        if col in ('ViewCount', 'Score'):
            sns.histplot(df[col], bins=10, ax=ax)
        elif col == 'CreationYear':
            sns.countplot(x=col, data=df, ax=ax)
        else:
            sns.barplot(x=tag_counts.index, y=tag_counts.values, ax=ax)
        ax.set_title(f'{col} Distribution')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        ax.tick_params(axis='x', rotation=45)

    # Lay out the 4-panel figure itself, not whatever figure happens to be current.
    fig.tight_layout()
    plt.show()
    # Release the figure; this function is called once per collection in a loop.
    plt.close(fig)


In [None]:
# Print the summary-statistics table of every collection in random_collections.
for collection_name in random_collections:
    print(collection_name)
    collection = db[collection_name]
    print(describe_data(collection))

In [None]:
# Plot the distribution panels for every collection in random_collections.
for collection_name in random_collections:
    print(collection_name)
    collection = db[collection_name]
    view_distribution(collection)

In [32]:
def get_word_cloud(collection):
    """Show side-by-side word clouds of the ``Title`` and ``Body`` fields.

    Parameters
    ----------
    collection
        Object whose ``find()`` yields the post documents (e.g. a pymongo
        collection). Each document is expected to provide ``Title`` and
        ``Body``; missing values are dropped before the text is joined.

    Returns
    -------
    None
        The two-panel figure is displayed with ``plt.show()``.
    """
    frame = pd.DataFrame(list(collection.find()))

    fig, axes = plt.subplots(1, 2, figsize=(16, 4))
    for ax, col in zip(axes, ['Title', 'Body']):
        # Concatenate every non-missing value of the column into one string.
        joined = ' '.join(frame[col].dropna().astype(str).tolist())
        # Keep letters (lower-cased); every other character becomes a space.
        text = ''.join(ch.lower() if ch.isalpha() else ' ' for ch in joined)

        cloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            stopwords=None,
        ).generate(text)

        ax.imshow(cloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(f'Word Cloud: {col}')

    plt.tight_layout()
    plt.show()


In [None]:

# Render the Title/Body word clouds for every collection in random_collections.
for collection_name in random_collections:
    print(collection_name)
    collection = db[collection_name]
    get_word_cloud(collection)

In [34]:
def views_vs_score(collection):
    """Scatter-plot each post's ``Score`` (x) against its ``ViewCount`` (y).

    Parameters
    ----------
    collection
        Object whose ``find()`` yields the post documents (e.g. a pymongo
        collection). Each document is expected to provide ``Score`` and
        ``ViewCount``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    frame = pd.DataFrame(list(collection.find()))

    # Single-panel figure: semi-transparent blue markers so overlaps stay visible.
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(frame['Score'], frame['ViewCount'], color='blue', alpha=0.5)

    ax.set_title('Scatter Plot: Score vs. View Count')
    ax.set_xlabel('Score')
    ax.set_ylabel('View Count')

    plt.show()

In [None]:

# Draw the Score vs. View Count scatter plot for every collection in random_collections.
for collection_name in random_collections:
    print(collection_name)
    collection = db[collection_name]
    views_vs_score(collection)