In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

Part 1

In [3]:
# Load data from a CSV file or any other data source
data = pd.read_csv("41-50.csv")

In [4]:
# Load data here and calculate the citation ratio and normalize the data

# Calculate the citation ratio (2022/2021)
data['citation_ratio'] = data['cit_2022'] / data['cit_2021']

# Select the columns for citation data from 2017 to 2022
citation_columns = ['cit_2017', 'cit_2018', 'cit_2019', 'cit_2020', 'cit_2021', 'cit_2022']

# Normalize the citation data using Min-Max scaling
scaler = MinMaxScaler()
data[citation_columns] = scaler.fit_transform(data[citation_columns])

# Convert the labels (categories) to numerical values
data['category'] = data['citation_ratio'].apply(lambda ratio: 0 if ratio < 1.05 else (1 if ratio <= 1.15 else 2))
data['category'] = data['category'].astype('category')

# Split the data
X = data[citation_columns].values
y = data['category']  

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now, 'data' contains the normalized citation data and their corresponding categories


In [5]:
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 75.00%


Part 2

In [6]:
# Load data from a CSV file or any other data source
data_new = pd.read_csv("41-50.csv")

In [7]:
# Calculate new features
for year in range(2017, 2022):
    data_new[f'citation_change_{year}'] = (data_new[f'cit_{year+1}'] - data_new[f'cit_{year}']) / data_new[f'cit_{year}']

# Calculate the citation ratio (2022/2021)
data_new['citation_ratio'] = data_new['cit_2022'] / data_new['cit_2021']

# Convert the labels (categories) to numerical values
data['category'] = data['citation_ratio'].apply(lambda ratio: 0 if ratio < 1.05 else (1 if ratio <= 1.15 else 2))
data['category'] = data['category'].astype('category')

# Separate features (X) and target variable (y)
X_new = data_new.drop(['univ_rank', 'first_initial', 'last_initial'], axis=1)

# Drop the columns related to raw citation numbers
X_new = X_new.drop([f'cit_{year}' for year in range(2017, 2022)], axis=1)

# Select only the new features
new_features = [f'citation_change_{year}' for year in range(2017, 2022)]
X_new_features = X_new[new_features]

# Handle division by zero or very small values
X_new_features.replace([np.inf, -np.inf], np.nan, inplace=True)
X_new_features.fillna(0, inplace=True)  # Replace NaN with 0

y_new = data['category']


# Split the data into training and testing sets
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new_features, y_new, test_size=0.2, random_state=42)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_new_features.replace([np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_new_features.fillna(0, inplace=True)  # Replace NaN with 0


In [9]:
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train_new, y_train_new)

# Make predictions on the test data
y_pred_new = rf_classifier.predict(X_test_new)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test_new, y_pred_new)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 100.00%
