In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the data
data = pd.read_csv('../data/final.csv')

# Split the data into training and test sets
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Extract the text and originality score columns from the training data
train_text = train_data['text']
train_scores = train_data['originality_score']

# Create a TF-IDF vectorizer to extract features from the text
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(train_text)

# Train a linear regression model on the features and scores
regressor = LinearRegression()
regressor.fit(train_features, train_scores)

# Extract the text column from the test data
test_text = test_data['text']

# Extract features from the test text using the same vectorizer as before
test_features = vectorizer.transform(test_text)

# Use the trained regression model to predict the originality scores for the test data
predicted_scores = regressor.predict(test_features)

import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Compute the mean squared error (MSE) and R-squared (R2) for the test set
test_scores = test_data['originality_score']
mse = mean_squared_error(test_scores, predicted_scores)
r2 = r2_score(test_scores, predicted_scores)

print(f'Mean squared error (MSE): {mse:.4f}')
print(f'R-squared (R2): {r2:.4f}')


Mean squared error (MSE): 523.4842
R-squared (R2): 0.1355


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
data = pd.read_csv('../data/final.csv')

# Split the data into training and test sets
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Extract the text and originality score columns from the training data
train_text = train_data['text']
train_scores = train_data['originality_score']

# Create a TF-IDF vectorizer to extract features from the text
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(train_text)

# Define a list of regression algorithms to use
regressors = [
    LinearRegression(),
    Ridge(alpha=0.1),
    Lasso(alpha=0.1),
    ElasticNet(alpha=0.1, l1_ratio=0.5),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    MLPRegressor()
]

# Train and evaluate each regression algorithm
for regressor in regressors:
    # Train the model on the features and scores
    regressor.fit(train_features, train_scores)

    # Extract the text column from the test data
    test_text = test_data['text']

    # Extract features from the test text using the same vectorizer as before
    test_features = vectorizer.transform(test_text)

    # Use the trained regression model to predict the originality scores for the test data
    predicted_scores = regressor.predict(test_features)

    # Compute the mean squared error (MSE) and R-squared (R2) for the test set
    test_scores = test_data['originality_score']
    mse = mean_squared_error(test_scores, predicted_scores)
    r2 = r2_score(test_scores, predicted_scores)

    # Print the results
    print(f'Regressor: {regressor.__class__.__name__}')
    print(f'Mean squared error (MSE): {mse:.4f}')
    print(f'R-squared (R2): {r2:.4f}')
    print()


Regressor: LinearRegression
Mean squared error (MSE): 523.4842
R-squared (R2): 0.1355

Regressor: Ridge
Mean squared error (MSE): 513.5818
R-squared (R2): 0.1518

Regressor: Lasso
Mean squared error (MSE): 591.1206
R-squared (R2): 0.0238

Regressor: ElasticNet
Mean squared error (MSE): 602.3819
R-squared (R2): 0.0052

Regressor: DecisionTreeRegressor
Mean squared error (MSE): 1065.4946
R-squared (R2): -0.7597

Regressor: RandomForestRegressor
Mean squared error (MSE): 564.2367
R-squared (R2): 0.0682

Regressor: MLPRegressor
Mean squared error (MSE): 519.7640
R-squared (R2): 0.1416





In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
data = pd.read_csv('../data/final_unscaled.csv')

# Split the data into training and test sets
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Extract the text and originality score columns from the training data
train_text = train_data['text']
train_scores = train_data['originality_score']

# Create a TF-IDF vectorizer to extract features from the text
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(train_text)

# Train a linear regression model on the features and scores
lin_regressor = LinearRegression()
lin_regressor.fit(train_features, train_scores)

# Train an SVR model on the features and scores
svr_regressor = SVR(kernel='linear')
svr_regressor.fit(train_features, train_scores)

# Train a random forest regression model on the features and scores
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(train_features, train_scores)

# Extract the text column from the test data
test_text = test_data['text']

# Extract features from the test text using the same vectorizer as before
test_features = vectorizer.transform(test_text)

# Use the trained models to predict the originality scores for the test data
lin_predicted_scores = lin_regressor.predict(test_features)
svr_predicted_scores = svr_regressor.predict(test_features)
rf_predicted_scores = rf_regressor.predict(test_features)

# Compute the mean squared error (MSE) and R-squared (R2) for the test set
test_scores = test_data['originality_score']
lin_mse = mean_squared_error(test_scores, lin_predicted_scores)
lin_r2 = r2_score(test_scores, lin_predicted_scores)

svr_mse = mean_squared_error(test_scores, svr_predicted_scores)
svr_r2 = r2_score(test_scores, svr_predicted_scores)

rf_mse = mean_squared_error(test_scores, rf_predicted_scores)
rf_r2 = r2_score(test_scores, rf_predicted_scores)

# Print the results
print("Linear Regression:")
print(f'Mean squared error (MSE): {lin_mse:.4f}')
print(f'R-squared (R2): {lin_r2:.4f}')

print("\nSupport Vector Regression:")
print(f'Mean squared error (MSE): {svr_mse:.4f}')
print(f'R-squared (R2): {svr_r2:.4f}')

print("\nRandom Forest Regression:")
print(f'Mean squared error (MSE): {rf_mse:.4f}')
print(f'R-squared (R2): {rf_r2:.4f}')

Linear Regression:
Mean squared error (MSE): 24.1416
R-squared (R2): -0.3128

Support Vector Regression:
Mean squared error (MSE): 18.7980
R-squared (R2): -0.0222

Random Forest Regression:
Mean squared error (MSE): 21.0656
R-squared (R2): -0.1455


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
data = pd.read_csv('../data/final_unscaled.csv')

# Split the data into training and test sets
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Extract the text and originality score columns from the training data
train_text = train_data['text']
train_scores = train_data['originality_score']

# Create a TF-IDF vectorizer to extract features from the text
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(train_text)

# Define a list of regression algorithms to use
regressors = [
    LinearRegression(),
    Ridge(alpha=0.1),
    Lasso(alpha=0.1),
    ElasticNet(alpha=0.1, l1_ratio=0.5),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    MLPRegressor()
]

# Train and evaluate each regression algorithm
for regressor in regressors:
    # Train the model on the features and scores
    regressor.fit(train_features, train_scores)

    # Extract the text column from the test data
    test_text = test_data['text']

    # Extract features from the test text using the same vectorizer as before
    test_features = vectorizer.transform(test_text)

    # Use the trained regression model to predict the originality scores for the test data
    predicted_scores = regressor.predict(test_features)

    # Compute the mean squared error (MSE) and R-squared (R2) for the test set
    test_scores = test_data['originality_score']
    mse = mean_squared_error(test_scores, predicted_scores)
    r2 = r2_score(test_scores, predicted_scores)

    # Print the results
    print(f'Regressor: {regressor.__class__.__name__}')
    print(f'Mean squared error (MSE): {mse:.4f}')
    print(f'R-squared (R2): {r2:.4f}')
    print()


Regressor: LinearRegression
Mean squared error (MSE): 24.1416
R-squared (R2): -0.3128

Regressor: Ridge
Mean squared error (MSE): 22.9389
R-squared (R2): -0.2474

Regressor: Lasso
Mean squared error (MSE): 18.5832
R-squared (R2): -0.0106

Regressor: ElasticNet
Mean squared error (MSE): 18.5832
R-squared (R2): -0.0106

Regressor: DecisionTreeRegressor
Mean squared error (MSE): 94.3468
R-squared (R2): -4.1306

Regressor: RandomForestRegressor
Mean squared error (MSE): 21.8464
R-squared (R2): -0.1880

Regressor: MLPRegressor
Mean squared error (MSE): 58.4414
R-squared (R2): -2.1781





In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
data = pd.read_csv('../data/final.csv')

# Split the data into training and test sets
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Extract the text and originality score columns from the training data
train_text = train_data['text']
train_scores = train_data['originality_score']

# Create a TF-IDF vectorizer to extract features from the text
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(train_text)

# Define a list of regression algorithms to use
regressors = [
    LinearRegression(),
    Ridge(alpha=0.1),
    Lasso(alpha=0.1),
    ElasticNet(alpha=0.1, l1_ratio=0.5),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    MLPRegressor()
]

# Train and evaluate each regression algorithm
for regressor in regressors:
    # Train the model on the features and scores
    regressor.fit(train_features, train_scores)

    # Extract the text column from the test data
    test_text = test_data['text']

    # Extract features from the test text using the same vectorizer as before
    test_features = vectorizer.transform(test_text)

    # Use the trained regression model to predict the originality scores for the test data
    predicted_scores = regressor.predict(test_features)

    # Compute the mean squared error (MSE) and R-squared (R2) for the test set
    test_scores = test_data['originality_score']
    mse = mean_squared_error(test_scores, predicted_scores)
    r2 = r2_score(test_scores, predicted_scores)

    # Print the results
    print(f'Regressor: {regressor.__class__.__name__}')
    print(f'Mean squared error (MSE): {mse:.4f}')
    print(f'R-squared (R2): {r2:.4f}')
    print()


Regressor: LinearRegression
Mean squared error (MSE): 523.4842
R-squared (R2): 0.1355

Regressor: Ridge
Mean squared error (MSE): 513.5818
R-squared (R2): 0.1518

Regressor: Lasso
Mean squared error (MSE): 591.1206
R-squared (R2): 0.0238

Regressor: ElasticNet
Mean squared error (MSE): 602.3819
R-squared (R2): 0.0052

Regressor: DecisionTreeRegressor
Mean squared error (MSE): 982.8065
R-squared (R2): -0.6231

Regressor: RandomForestRegressor
Mean squared error (MSE): 565.8297
R-squared (R2): 0.0655

Regressor: MLPRegressor
Mean squared error (MSE): 518.6397
R-squared (R2): 0.1435





In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

data = pd.read_csv('../data/final_unscaled.csv')

train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Extract the text and originality score columns from the training data
train_text = train_data['text']
train_scores = train_data['originality_score']

# Define a pipeline to preprocess the text and train the regression model
pipeline = Pipeline([
('tfidf', TfidfVectorizer()),
('scaler', StandardScaler(with_mean=False)),
('regressor', RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42))
])

# Train the pipeline on the training data
pipeline.fit(train_text, train_scores)

# Evaluate the pipeline on the test data
test_text = test_data['text']
predicted_scores = pipeline.predict(test_text)

mse = mean_squared_error(test_data['originality_score'], predicted_scores)
r2 = r2_score(test_data['originality_score'], predicted_scores)

print(f"Mean squared error (MSE): {mse:.4f}")
print(f"R-squared (R2): {r2:.4f}")

Mean squared error (MSE): 21.3229
R-squared (R2): -0.1595
