In [None]:
!pip install pyspark
!pip install findspark
!pip install pandas
!pip install matplotlib

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
     316.9/316.9 MB 4.2 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=bb8f0ca15d31578894fec6150841e9944693018ec47f55d6c917669a9b186d78
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the CSV into a pandas DataFrame; `data` is the frame the rest of the
# notebook works from.
file_path = './dataset/filtered_output.csv'
data = pd.read_csv(file_path)

# Count the missing entries in every column and report them.
missing_per_column = data.isna().sum()
print("Missing values:")
print(missing_per_column)

# Row count comes straight from the frame's shape.
num_rows = data.shape[0]
print("Number of rows in the DataFrame:", num_rows)

Missing values:
name                              0
stars                             0
forks                             0
watchers                          0
pullRequests                      0
primaryLanguage                   0
defaultBranchCommitCount          2
createdAt                         0
license                     1378490
dtype: int64
Number of rows in the DataFrame: 3021631


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import warnings
from matplotlib.ticker import FuncFormatter
from sklearn.metrics import precision_score, recall_score, f1_score

# Language-growth modelling.
#
# 1. Derive the creation year of every repository.
# 2. Build a per-language "growth" score: the number of years in which a
#    language gained more repositories than in the previous year.
# 3. Train and evaluate a shallow decision tree that predicts that score.
# 4. Print the languages with the highest and lowest growth scores.

# How many languages to list in each printed ranking.
N_TOP_GROWING = 10
N_TOP_DECLINING = 5

# Parse 'createdAt' and derive the creation year. `data` is updated in place,
# as before, so any later cell that reads data['year'] keeps working; both
# statements are safe to re-run.
data['createdAt'] = pd.to_datetime(data['createdAt'])
data['year'] = data['createdAt'].dt.year

# Columns needed for modelling; .copy() avoids SettingWithCopyWarning when
# columns of model_data are overwritten further down.
selected_columns = ['name', 'primaryLanguage', 'year']
model_data = data[selected_columns].copy()

# Repository counts with one row per language and one column per year.
language_growth = model_data.groupby(['primaryLanguage', 'year']).size().unstack(fill_value=0)

# Response variable: number of year-over-year increases per language.
# diff() leaves NaN in the first year column; NaN > 0 is False, so that column
# never counts as an increase. The right-hand side is evaluated before the
# 'growth' column exists, so only year columns take part in the diff.
language_growth['growth'] = language_growth.diff(axis=1).gt(0).sum(axis=1)

# Turn the 'primaryLanguage' index into a regular column for merging/printing.
language_growth = language_growth.reset_index()

# Attach each language's growth score to every repository of that language.
model_data = pd.merge(model_data, language_growth[['primaryLanguage', 'growth']], on='primaryLanguage')

# Encode categorical variables. A separate encoder per column keeps both
# mappings available for inverse_transform; `label_encoder` ends up fitted on
# 'primaryLanguage', exactly as it did before.
name_encoder = LabelEncoder()
model_data['name'] = name_encoder.fit_transform(model_data['name'])
label_encoder = LabelEncoder()
model_data['primaryLanguage'] = label_encoder.fit_transform(model_data['primaryLanguage'])

# Features and target.
# NOTE(review): 'growth' is merged in per 'primaryLanguage', so it is constant
# within a language, and 'primaryLanguage' is also a feature. The classifier
# can therefore learn a language -> growth lookup; the scores below measure
# that lookup rather than any ability to forecast. Confirm this is intended.
X = model_data[['name', 'primaryLanguage']]
y = model_data['growth']

# Hold out 20% of the rows for the final precision/recall/F1 report.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Shallow decision tree with constraints to limit over-fitting.
clf = DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=5, min_samples_leaf=4)

# Silence the "least populated class" UserWarning only while cross-validating.
# catch_warnings() restores the previous filter state on exit, whereas
# warnings.filterwarnings("default") would install a catch-all filter that
# overrides every other configured filter.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", message="The least populated class in y has only 1 members")

    # 5-fold cross-validation with the default splitter for cv=5.
    cv_scores = cross_val_score(clf, X, y, cv=5)
    average_accuracy = cv_scores.mean() * 100  # mean accuracy in percent
    print("Accuracy of the model (after cross-validation): {:.2f}%".format(average_accuracy))

    # Same evaluation with shuffled, stratified folds.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(clf, X, y, cv=skf)
    average_accuracy = cv_scores.mean() * 100
    print("Accuracy of the model (after stratified cross-validation): {:.2f}%".format(average_accuracy))

# Fit on the training split and predict the held-out split.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Weighted precision / recall / F1. zero_division=0 scores a class with an
# undefined metric (e.g. a class that is never predicted) as 0; using 1 would
# count such classes as perfect and inflate the weighted average.
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1 Score: {:.2f}".format(f1))
print()

# Rank the languages once per direction and slice the rankings as needed.
# NOTE(review): languages that share a growth score are ordered by whatever
# sort_values yields for ties, so the boundary entries may be arbitrary.
growth_descending = language_growth.sort_values('growth', ascending=False)
growth_ascending = language_growth.sort_values('growth', ascending=True)

# Five highest / five lowest growth scores.
top_growing_languages = growth_descending.head(5)
top_declining_languages = growth_ascending.head(5)

# Rankings used for the printed report. `top_10_declining` keeps its original
# name for compatibility but holds N_TOP_DECLINING (5) rows, not 10.
top_10_growing = growth_descending.head(N_TOP_GROWING)
top_10_declining = growth_ascending.head(N_TOP_DECLINING)

# Languages with the highest growth scores (names only).
print("Top Languages That Might Become Popular in Future:")
for lang in top_10_growing['primaryLanguage']:
    print(lang)

# Blank line between the two lists.
print()

# Languages with the lowest growth scores (names only).
print("Top Languages That Might Become Extinct:")
for lang in top_10_declining['primaryLanguage']:
    print(lang)


Accuracy of the model (after cross-validation): 67.01%
Accuracy of the model (after stratified cross-validation): 82.10%
Precision: 0.83
Recall: 0.82
F1 Score: 0.79

Top Languages That Might Become Popular in Future:
TypeScript
Nix
Rust
Batchfile
PLpgSQL
Go
SCSS
Kotlin
HTML
Jupyter Notebook

Top Languages That Might Become Extinct:
Ragel in Ruby Host
Gradle
desktop
Quake
WGSL
