<div style="text-align: center; font-weight: bold; font-size: 24px;">
    DS542 - DevGPT
</div>


#### Question 7: How accurately can we predict the length of a conversation with ChatGPT?

---

In [127]:
import pandas as pd
import altair as alt
import numpy as np
import json
import sklearn
import spacy
from spacy import displacy
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline

In [128]:
# Reads in JSON files
discuss_json = pd.read_json('20230727_195954_discussion_sharings.json')
issues_json = pd.read_json('20230727_195941_issue_sharings.json')

In [129]:
# Normalizes 'Sources' column
discuss_df = pd.json_normalize(discuss_json['Sources'], sep='_')

# Normalizes 'ChatgptSharing' column
discuss_df2 = pd.json_normalize(
    discuss_df['ChatgptSharing'].explode(),
    sep='_'
)

# Normalizes and explodes 'Conversations'
discuss_df3 = pd.json_normalize(
    discuss_df2['Conversations'].explode(),
    sep='_'
)

# Merges dataframes
merged_df1 = pd.merge(discuss_df, discuss_df2, left_index=True, right_index=True, how='outer')
final_df = pd.merge(merged_df1, discuss_df3, left_index=True, right_index=True, how='outer')
print(final_df.columns)

date_of_conversation = final_df['DateOfConversation']
print(date_of_conversation.head())

mentioned_url = final_df['Mention_MentionedURL']
print(mentioned_url.head())

Index(['Type', 'URL_x', 'Author', 'RepoName', 'RepoLanguage', 'Number',
       'Title_x', 'Body', 'CreatedAt', 'ClosedAt', 'UpdatedAt', 'Closed',
       'UpvoteCount', 'ChatgptSharing', 'URL_y', 'Status',
       'DateOfConversation', 'DateOfAccess', 'Title_y', 'NumberOfPrompts',
       'TokensOfPrompts', 'TokensOfAnswers', 'Model', 'Conversations',
       'HTMLContent', 'Mention_MentionedURL', 'Mention_MentionedProperty',
       'Mention_MentionedAuthor', 'Mention_MentionedText',
       'Mention_MentionedIsAnswer', 'Mention_MentionedUpvoteCount', 'Prompt',
       'Answer', 'ListOfCode'],
      dtype='object')
0    July 11, 2023
1    July 11, 2023
2    July 15, 2023
3    July 14, 2023
4    June 26, 2023
Name: DateOfConversation, dtype: object
0    https://github.com/deep-foundation/Discussions...
1    https://github.com/orgs/deep-foundation/discus...
2    https://github.com/JushBJJ/Mr.-Ranedeer-AI-Tut...
3    https://github.com/JushBJJ/Mr.-Ranedeer-AI-Tut...
4    https://github.com/dtch

In [130]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 179 entries, 0 to 178
Data columns (total 34 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Type                          32 non-null     object 
 1   URL_x                         32 non-null     object 
 2   Author                        32 non-null     object 
 3   RepoName                      32 non-null     object 
 4   RepoLanguage                  27 non-null     object 
 5   Number                        32 non-null     float64
 6   Title_x                       32 non-null     object 
 7   Body                          32 non-null     object 
 8   CreatedAt                     32 non-null     object 
 9   ClosedAt                      4 non-null      object 
 10  UpdatedAt                     32 non-null     object 
 11  Closed                        32 non-null     object 
 12  UpvoteCount                   32 non-null     float64
 13  ChatgptSha

In [131]:
# Evaluates summary statistics
final_df.describe()

Unnamed: 0,Number,UpvoteCount,Status,NumberOfPrompts,TokensOfPrompts,TokensOfAnswers,Mention_MentionedUpvoteCount
count,32.0,32.0,38.0,33.0,33.0,33.0,19.0
mean,537.1875,1.75,226.842105,5.272727,596.878788,1728.393939,1.052632
std,1275.809533,2.962127,69.884277,6.746632,845.003615,2494.059391,0.524265
min,1.0,0.0,200.0,1.0,15.0,25.0,0.0
25%,6.75,1.0,200.0,1.0,41.0,260.0,1.0
50%,136.5,1.0,200.0,3.0,200.0,496.0,1.0
75%,368.5,1.0,200.0,6.0,809.0,2566.0,1.0
max,5758.0,17.0,404.0,30.0,3084.0,12044.0,3.0


In [132]:
# Number of turns in each conversation
final_df['ConversationTurns'] = final_df['Conversations'].apply(lambda x: len(x) if isinstance(x, list) else 0)

# Average number of turns
average_turns = final_df['ConversationTurns'].mean()
print(f"Average number of turns per conversation: {average_turns}")
print(final_df[['Prompt', 'ConversationTurns']].head())

Average number of turns per conversation: 0.9720670391061452
                                              Prompt  ConversationTurns
0  Can I always use await import instead of plain...                  1
1  Can I always use await import instead of plain...                  1
2  ===\nAuthor: JushBJJ\nName: "Mr. Ranedeer"\nVe...                  1
3  [Personalization Options]\n    Language: ["Eng...                  1
4  You are an agent in a gridworld.\nThe environm...                  3


In [113]:
# Included to avoid error message
pd.set_option('future.no_silent_downcasting', True)

# Replaces 'Default' in string columns
string_columns = final_df.select_dtypes(include=['object']).columns
final_df[string_columns] = final_df[string_columns].replace('Default', np.nan)

# Converts columns to numeric
columns_to_clean = ['TokensOfPrompts', 'NumberOfPrompts', 'TokensOfAnswers']
for col in columns_to_clean:
    final_df[col] = pd.to_numeric(final_df[col], errors='coerce')

# Fills NaNs with 0
final_df.fillna(0, inplace=True)

In [114]:
# Fills missing values in the 'Prompt' column with empty string
final_df['Prompt_clean'] = final_df['Prompt'].fillna('').str.lower().str.replace('[^\\w\\s]', '', regex=True)
print(final_df['Prompt_clean'])

0      can i always use await import instead of plain...
1      can i always use await import instead of plain...
2      \nauthor jushbjj\nname mr ranedeer\nversion 27...
3      personalization options\n    language english ...
4      you are an agent in a gridworld\nthe environme...
                             ...                        
174                                             and more
175                                                 more
176    data lake and data warehouse are bad names the...
177                            give me some better names
178                               give me some fun names
Name: Prompt_clean, Length: 179, dtype: object


In [115]:
nlp = spacy.load("en_core_web_sm")

In [116]:
final_df['Prompt_clean'] = final_df['Prompt_clean'].astype(str)
final_df['Prompt_clean'].apply(nlp)
final_df['Prompt_tokens'] = final_df['Prompt_clean'].apply(lambda x: [token.text for token in nlp(x)])

Preprocesses`final_df` data to move forward with predictions:

In [117]:
# Creates new featue that representes number of words in prompt column
final_df['PromptLength'] = final_df['Prompt_clean'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

# Defines features and target
features = ['TokensOfPrompts', 'PromptLength', 'NumberOfPrompts', 'Model']
target = 'ConversationTurns'

# Drops rows with missing values
final_df.dropna(subset=features + [target], inplace=True)
X = final_df[features]
y = final_df[target]

# Scales 
preprocessor = ColumnTransformer(
    transformers=[
        ('model', OneHotEncoder(), ['Model']),  
        ('num', StandardScaler(), ['TokensOfPrompts', 'PromptLength', 'NumberOfPrompts'])  
    ])

Splitting `X` and `y` data and model training:

In [118]:
# Splits data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2025)

# Pipeline with preprocessing and random forest model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=2025))
])

# Trains model
pipeline.fit(X_train, y_train)

param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print("Best parameters from Grid Search:", grid_search.best_params_)

Best parameters from Grid Search: {'regressor__max_depth': None, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}


- hyperparameters suggest flexible model that captures complex relationships in the data, but could be prone to overfitting
- model could handle a variety of data points b/c able to have deep trees and a high number of estimators.

In [119]:
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

In [120]:
# Calculates mae and rmse
mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
r2 = r2_score(y_test, predictions)


print(f"mean absolute error: {round(mae,2)}")
print(f"root mean squared error: {round(rmse,2)}")
print(f"R-squared: {round(r2, 2)}")

cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_squared_error')
print(f"Mean Cross-Validation MSE: {round(-cv_scores.mean(),2)}")

mean absolute error: 0.16
root mean squared error: 0.46
R-squared: 0.93
Mean Cross-Validation MSE: 3.28


- mae of 0.16 is relatively low. predictions are within ~0.16 of the true values on average.
- model is quite accurate in predicting conversation turns but could be overfitting

- RMSE indicates that model's predictions deviate from the true values by around 0.46
- is slightly higher than the MAE but still suggests the model is reasonably accurate. 

- The MSE of 3.93 is significantly higher - model may not generalize as well to new data. Look at XGboost (?)

In [121]:
# Predicts conversation turns
predictions_full = best_model.predict(X)
print(f"Predictions on full data: {predictions_full[:10]}")

Predictions on full data: [0.975 0.975 2.82  1.185 4.225 4.225 2.79  1.01  1.99  5.09 ]


In [122]:
len(predictions_full)

179

In [123]:
# Checks model inside pipeline
print(best_model.named_steps)

{'preprocessor': ColumnTransformer(transformers=[('model', OneHotEncoder(), ['Model']),
                                ('num', StandardScaler(),
                                 ['TokensOfPrompts', 'PromptLength',
                                  'NumberOfPrompts'])]), 'regressor': RandomForestRegressor(n_estimators=200, random_state=2025)}


In [124]:
# Predictions
predict_y = best_model.predict(X_train)

# Actual vs. predicted vals
c_targets = pd.DataFrame(
    {
        "y": y_train,
        "pred_y": predict_y.tolist(),
    }
).sort_values(by="pred_y", ascending=False) 

c_targets.head()

Unnamed: 0,y,pred_y
17,30,25.86
32,20,19.405
36,20,18.225
13,15,13.92
22,10,8.47


In [125]:
# Plot of actual vs. pred
chart = alt.Chart(c_targets).mark_point().encode(
    x=alt.X('y', title="Actual Number of Turns"),
    y=alt.Y('pred_y', title="Predicted Number of Turns"),
    tooltip=['y', 'pred_y']
).properties(width=600, height=400,
    title='Actual vs Predicted Y Values'
)

# Add a line for ideal prediction (y = x)
ideal_line = alt.Chart(c_targets).mark_line(color='green').encode(
    x='y',
    y='y'
)

final_chart = chart + ideal_line
final_chart