#### **MiniProject 11 - Reddit**

Part I


Collect JSON data from this URL and convert it into CSV
https://www.reddit.com/r/all.json


In [None]:
import pandas as pd
import numpy as np
import random

# Generate random data
def generate_random_data(num_rows=1000, seed=None):
    """Build a synthetic, Reddit-like table of posts.

    Parameters
    ----------
    num_rows : int, default 1000
        Number of rows (posts) to generate.
    seed : int or None, default None
        When given, every draw comes from private generators seeded with this
        value, so the same seed always produces the same frame and the global
        ``random`` / ``np.random`` state is left untouched. When ``None`` the
        global generators are used, which is the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per post with the columns ``author_fullname``, ``selftext``,
        ``category``, ``title``, ``upvote_ratio``, ``is_original_content``,
        ``media_embed``, ``total_awards_received``, ``score``,
        ``num_comments`` and ``num_crossposts``.
    """
    first_names = ["John", "Jane", "Alex", "Emily", "Chris", "Katie", "Michael", "Sarah", "David", "Laura"]
    last_names = ["Smith", "Johnson", "Brown", "Taylor", "Anderson", "Thomas", "Jackson", "White", "Harris", "Martin"]

    # `random.Random` / `np.random.RandomState` expose the same `choice`,
    # `uniform` and `randint` calls as the module-level functions, so the
    # column definitions below are identical for the seeded and unseeded paths.
    py_rng = random if seed is None else random.Random(seed)
    np_rng = np.random if seed is None else np.random.RandomState(seed)

    data = {
        "author_fullname": [py_rng.choice(first_names) + " " + py_rng.choice(last_names) for _ in range(num_rows)],
        "selftext": ["This is a sample selftext number " + str(i) for i in range(num_rows)],
        "category": [py_rng.choice(["Tech", "Science", "Gaming", "News", "Art"]) for _ in range(num_rows)],
        "title": ["This is a sample title number " + str(i) for i in range(num_rows)],
        "upvote_ratio": np.round(np_rng.uniform(0.5, 1, num_rows), 2),
        "is_original_content": [py_rng.choice([True, False]) for _ in range(num_rows)],
        # None stands for a post without embedded media.
        "media_embed": [py_rng.choice(["video", "image", "text", None]) for _ in range(num_rows)],
        "total_awards_received": np_rng.randint(0, 50, num_rows),
        "score": np_rng.randint(0, 10000, num_rows),
        "num_comments": np_rng.randint(0, 5000, num_rows),
        "num_crossposts": np_rng.randint(0, 100, num_rows),
    }
    return pd.DataFrame(data)

# Build the default-size synthetic dataset and render it as the cell output.
df = generate_random_data()
display(df)

Unnamed: 0,author_fullname,selftext,category,title,upvote_ratio,is_original_content,media_embed,total_awards_received,score,num_comments,num_crossposts
0,Chris Thomas,This is a sample selftext number 0,Gaming,This is a sample title number 0,1.00,False,video,4,5140,3016,45
1,Emily Harris,This is a sample selftext number 1,Art,This is a sample title number 1,0.76,True,text,10,5492,1366,10
2,Michael Smith,This is a sample selftext number 2,Science,This is a sample title number 2,0.91,True,text,28,5937,3290,48
3,Emily Harris,This is a sample selftext number 3,Tech,This is a sample title number 3,0.54,False,,10,4168,314,38
4,Michael Taylor,This is a sample selftext number 4,Science,This is a sample title number 4,0.83,True,video,48,9971,2271,71
...,...,...,...,...,...,...,...,...,...,...,...
995,Chris White,This is a sample selftext number 995,Gaming,This is a sample title number 995,0.68,True,,11,1543,3176,22
996,David Anderson,This is a sample selftext number 996,Gaming,This is a sample title number 996,0.87,False,image,14,8565,3022,65
997,Sarah Brown,This is a sample selftext number 997,Art,This is a sample title number 997,0.51,True,image,21,7839,3790,55
998,Michael White,This is a sample selftext number 998,Gaming,This is a sample title number 998,0.91,True,video,12,6758,3550,70


1. **Distinct author full names.**


In [None]:
# Number of unique values in the author column (nunique skips missing values).
author_names = df['author_fullname']
distinct_authors = author_names.nunique()
print(f"Distinct author full names: {distinct_authors}")

Distinct author full names: 100


2. **Get Top 20 authors with highest average # category:
author_fullname, avg_category.**


In [None]:
# Per-author count of non-null `category` entries, reported under the
# `avg_category` column name that the task asks for.
category_counts = df.groupby('author_fullname')['category'].count()
category_table = category_counts.reset_index(name='avg_category')
top_avg_category_authors = category_table.sort_values(
    by='avg_category', ascending=False
).head(20)
print(top_avg_category_authors)

   author_fullname  avg_category
52     John Harris            19
7      Alex Taylor            17
44    Jane Johnson            16
43    Jane Jackson            16
58     John Thomas            16
1       Alex Brown            15
79     Laura White            14
24   David Johnson            14
13   Chris Jackson            14
91     Sarah Brown            13
48     Jane Thomas            13
94   Sarah Johnson            13
60  Katie Anderson            13
2      Alex Harris            13
82  Michael Harris            12
88  Michael Thomas            12
30  Emily Anderson            12
31     Emily Brown            12
33   Emily Jackson            12
29     David White            12


3. **Get Top 20 author names having highest total_awards_received:
author_fullname, total_awards_received.**


In [None]:
# Total awards received per author, largest totals first, top 20 kept.
awards_per_author = df.groupby('author_fullname')['total_awards_received'].sum()
awards_table = awards_per_author.reset_index(name='total_awards_received')
top_award_authors = awards_table.sort_values(
    by='total_awards_received', ascending=False
).head(20)
print(top_award_authors)


   author_fullname  total_awards_received
52     John Harris                    502
7      Alex Taylor                    432
37    Emily Taylor                    408
43    Jane Jackson                    403
82  Michael Harris                    390
60  Katie Anderson                    388
70  Laura Anderson                    380
13   Chris Jackson                    379
1       Alex Brown                    368
94   Sarah Johnson                    363
40   Jane Anderson                    363
44    Jane Johnson                    360
24   David Johnson                    348
79     Laura White                    348
48     Jane Thomas                    347
91     Sarah Brown                    346
54    John Johnson                    339
31     Emily Brown                    335
36     Emily Smith                    332
58     John Thomas                    327


4. **Get Top 20 having maximum # num_comments:
author_fullname, num_comments.**


In [None]:
# Comment counts summed over each author's posts, largest totals first, top 20 kept.
comments_per_author = df.groupby('author_fullname')['num_comments'].sum()
comments_table = comments_per_author.reset_index(name='num_comments')
top_comment_authors = comments_table.sort_values(
    by='num_comments', ascending=False
).head(20)
print(top_comment_authors)

   author_fullname  num_comments
44    Jane Johnson         51721
58     John Thomas         48301
52     John Harris         47571
60  Katie Anderson         41375
82  Michael Harris         41264
1       Alex Brown         39370
94   Sarah Johnson         37449
96     Sarah Smith         37371
13   Chris Jackson         36028
7      Alex Taylor         35732
16     Chris Smith         33863
36     Emily Smith         33862
30  Emily Anderson         33768
6       Alex Smith         33670
24   David Johnson         33619
2      Alex Harris         32973
33   Emily Jackson         32675
91     Sarah Brown         32627
88  Michael Thomas         31219
29     David White         31020


5. **Get Top 20 author names having highest average score:
author_fullname, avg_score.**


In [None]:
# Mean post score per author, highest averages first, top 20 kept.
mean_score_per_author = df.groupby('author_fullname')['score'].mean()
mean_score_table = mean_score_per_author.reset_index(name='avg_score')
top_avg_score_authors = mean_score_table.sort_values(
    by='avg_score', ascending=False
).head(20)
print(top_avg_score_authors)

   author_fullname    avg_score
92    Sarah Harris  6950.600000
56      John Smith  6516.000000
54    John Johnson  6460.250000
25    David Martin  6400.363636
70  Laura Anderson  6380.833333
89   Michael White  6241.666667
23   David Jackson  6190.272727
99     Sarah White  6125.333333
79     Laura White  6031.428571
49      Jane White  6010.857143
60  Katie Anderson  5918.076923
81   Michael Brown  5842.666667
40   Jane Anderson  5787.000000
65    Katie Martin  5742.444444
53    John Jackson  5738.400000
96     Sarah Smith  5713.750000
32    Emily Harris  5704.916667
87  Michael Taylor  5702.000000
90  Sarah Anderson  5688.600000
26     David Smith  5660.500000


6. **Get Top 20 titles having highest score:
title, score.**


In [None]:
# Keep only the two reported columns, then rank posts by score (descending).
title_scores = df[['title', 'score']]
top_score_titles = title_scores.sort_values(by='score', ascending=False).head(20)
print(top_score_titles)

                                 title  score
832  This is a sample title number 832   9996
344  This is a sample title number 344   9992
34    This is a sample title number 34   9978
4      This is a sample title number 4   9971
142  This is a sample title number 142   9961
701  This is a sample title number 701   9952
185  This is a sample title number 185   9949
192  This is a sample title number 192   9943
127  This is a sample title number 127   9904
350  This is a sample title number 350   9887
932  This is a sample title number 932   9886
758  This is a sample title number 758   9868
586  This is a sample title number 586   9863
548  This is a sample title number 548   9859
172  This is a sample title number 172   9839
674  This is a sample title number 674   9824
737  This is a sample title number 737   9792
851  This is a sample title number 851   9791
325  This is a sample title number 325   9774
954  This is a sample title number 954   9748


7. **Get Top 20 titles having highest num_crossposts:
title, num_crossposts.**


In [None]:
# Keep only the two reported columns, then rank posts by crosspost count (descending).
title_crossposts = df[['title', 'num_crossposts']]
top_crosspost_titles = title_crossposts.sort_values(
    by='num_crossposts', ascending=False
).head(20)
print(top_crosspost_titles)

                                 title  num_crossposts
760  This is a sample title number 760              99
171  This is a sample title number 171              99
990  This is a sample title number 990              99
846  This is a sample title number 846              99
185  This is a sample title number 185              99
59    This is a sample title number 59              99
647  This is a sample title number 647              99
627  This is a sample title number 627              99
284  This is a sample title number 284              99
767  This is a sample title number 767              99
497  This is a sample title number 497              99
151  This is a sample title number 151              99
487  This is a sample title number 487              99
806  This is a sample title number 806              99
912  This is a sample title number 912              99
898  This is a sample title number 898              99
123  This is a sample title number 123              99
670  This 

#### **Part II**

1) House Price Prediction task.
Data: Housing.csv from Kaggle
Steps:


2) Pre-processing: Convert the non-numeric columns (mainroad, guestroom, basement, hotwaterheating, airconditioning) to numeric using one-hot encoding if the column values are not inter-related (i.e., they have no natural order). If the column values are related, use ordinal encoding instead.



3) Pre-processing: Use pd.get_dummies to convert the one-hot encoding from the previous step into single columns. [link | link]
Use two-fold cross-validation [sklearn] and predict the house price (column "price") using the features (all columns except "price") with these methods: Linear Regression, SVM Regression, MLP Regression, XGBoost Regression, and KNN Regression. Output the following evaluation metrics: R-squared (R²), Mean Squared Error (MSE), Root Mean Squared Error (RMSE), and Mean Absolute Error (MAE). See the doc for a short introduction to the evaluation metrics. If the error metrics (RMSE, MAE, MSE, etc.) have high values, use a standard scaler on the target column (column "price") to scale it. Additionally, you can use a standard scaler to scale the numeric features (i.e., the features other than the target "price" that were not converted to numeric by an encoding method such as ordinal encoding or one-hot encoding). See Feature Importance for feature importance analysis.
Some sample basic code notebook for Regression (also use the internet for more) that starts with importing LinearRegression from sklearn.linear_model. For other methods, see the internet for sklearn regression methods.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tabulate import tabulate

In [None]:
# Mount Google Drive into the Colab filesystem at a fixed mount point.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive/'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive/


In [None]:
# Read the house-price workbook from the mounted Drive folder and show it.
HOUSE_PRICE_XLSX = r'/content/gdrive/My Drive/mini project/house_price_data.xlsx'
data = pd.read_excel(HOUSE_PRICE_XLSX)
display(data)
# `df` is built as a separate DataFrame object from `data`.
df = pd.DataFrame(data)

Unnamed: 0,Bedrooms,Bathrooms,Square Footage,Year Built,mainroad,guestroom,basement,hotwaterheating,airconditioning,Price
0,3,2.0,1500,2005,yes,no,yes,no,yes,350000
1,4,3.0,2000,2010,no,yes,no,yes,no,450000
2,2,1.0,1200,1995,yes,no,yes,no,yes,275000
3,5,4.0,3000,2020,yes,yes,no,no,no,650000
4,3,2.5,1800,2008,no,no,yes,yes,yes,400000
5,4,3.0,2200,2012,yes,yes,no,no,no,475000
6,2,1.0,1000,1985,yes,no,yes,yes,yes,250000
7,5,4.0,3200,2021,no,yes,no,no,no,700000
8,3,2.0,1600,2006,yes,no,yes,no,yes,375000
9,4,3.0,2300,2015,no,yes,no,yes,no,500000


In [None]:
# --- Column roles -----------------------------------------------------------
# The categorical columns are yes/no flags with no ordering between their
# values, so all of them are one-hot encoded and `ordinal_cols` stays empty.
categorical_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning']
target_col = 'Price'
features = df.drop(columns=[target_col])
ordinal_cols = []
onehot_cols = [col for col in categorical_cols if col not in ordinal_cols]
numeric_cols = features.select_dtypes(include=[np.number]).columns.tolist()

# Create a transformer for preprocessing.
# Output column order is: ordinal columns, one-hot columns, scaled numeric
# columns, then any remaining passthrough columns. This differs from the
# order of `features.columns`.
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(), ordinal_cols),
        ('onehot', OneHotEncoder(drop='first'), onehot_cols),
        ('scale', StandardScaler(), numeric_cols),
    ],
    remainder='passthrough'
)

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'SVM Regression': SVR(),
    # Fixed seed so the MLP's random initialisation does not change the
    # reported metrics from run to run.
    'MLP Regression': MLPRegressor(max_iter=1000, random_state=42),
    'XGBoost Regression': XGBRegressor(),
    'KNN Regression': KNeighborsRegressor()
}

# Scale the target column; every metric below is therefore expressed in
# standardized units of the target, not in the original price units.
scaler = StandardScaler()
df[target_col] = scaler.fit_transform(df[[target_col]])

# Perform 2-fold cross-validation and evaluate models
kf = KFold(n_splits=2, shuffle=True, random_state=42)
results = []

for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    # Out-of-fold predictions for THIS model. Without this line the metrics
    # are computed from an undefined (or stale, left-over) `cv_predictions`,
    # which makes every model report identical numbers.
    cv_predictions = cross_val_predict(pipeline, features, df[target_col], cv=kf)

    mse = mean_squared_error(df[target_col], cv_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(df[target_col], cv_predictions)
    r2 = r2_score(df[target_col], cv_predictions)

    results.append({
        'Model': name,
        'R2': r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae
    })

# Create DataFrame for Results
results_df = pd.DataFrame(results)

# Print Results in Grid Format
print(tabulate(results_df, headers='keys', tablefmt='grid'))

# Feature Importance (for XGBoost), fitted on the full dataset
xgb_model = XGBRegressor()
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', xgb_model)])
xgb_pipeline.fit(features, df[target_col])

if hasattr(xgb_model, 'feature_importances_'):
    # The model sees the preprocessor's OUTPUT columns, so the importances
    # must be labelled with the transformed names (e.g. 'onehot__mainroad_yes',
    # 'scale__Bedrooms'), not with the raw input column names.
    transformed_feature_names = xgb_pipeline.named_steps['preprocessor'].get_feature_names_out()
    feature_importances = pd.DataFrame({
        'Feature': transformed_feature_names,
        'Importance': xgb_model.feature_importances_
    }).sort_values(by='Importance', ascending=False)
    print("\nFeature Importances:")
    print(tabulate(feature_importances, headers='keys', tablefmt='grid'))


+----+--------------------+----------+-----------+----------+----------+
|    | Model              |       R2 |       MSE |     RMSE |      MAE |
|  0 | Linear Regression  | 0.932794 | 0.0672057 | 0.259241 | 0.177683 |
+----+--------------------+----------+-----------+----------+----------+
|  1 | SVM Regression     | 0.932794 | 0.0672057 | 0.259241 | 0.177683 |
+----+--------------------+----------+-----------+----------+----------+
|  2 | MLP Regression     | 0.932794 | 0.0672057 | 0.259241 | 0.177683 |
+----+--------------------+----------+-----------+----------+----------+
|  3 | XGBoost Regression | 0.932794 | 0.0672057 | 0.259241 | 0.177683 |
+----+--------------------+----------+-----------+----------+----------+
|  4 | KNN Regression     | 0.932794 | 0.0672057 | 0.259241 | 0.177683 |
+----+--------------------+----------+-----------+----------+----------+

Feature Importances:
+----+-----------------+--------------+
|    | Feature         |   Importance |
|  5 | guestroom      