In [5]:
# --------------------------------------------------
# 1. IMPORTS
# --------------------------------------------------
import pandas as pd
import numpy as np
import zipfile, os, shutil
from google.colab import files
import ipywidgets as widgets
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")

# Try importing XGBoost and CatBoost
try:
    from xgboost import XGBRegressor
except:
    !pip install xgboost
    from xgboost import XGBRegressor

try:
    from catboost import CatBoostRegressor
except:
    !pip install catboost
    from catboost import CatBoostRegressor

# --------------------------------------------------
# 2. FILE UPLOAD (ZIP or EXCEL)
# --------------------------------------------------
print("Upload ZIP or Excel file…")
uploaded = files.upload()
# `uploaded` is keyed by file name; guard against an empty result so the
# failure is a clear message rather than an IndexError on the lookup below.
if not uploaded:
    raise ValueError("No file was uploaded!")
file_name = next(iter(uploaded))
print("Uploaded:", file_name)

# Create extraction folder (wiped first so stale files from an earlier run
# cannot show up in the file list)
extract_dir = "/content/extracted_excel"
if os.path.exists(extract_dir):
    shutil.rmtree(extract_dir)
os.makedirs(extract_dir, exist_ok=True)

excel_files = []

# --------------------------------------------------
# 3. CHECK IF ZIP OR EXCEL
# --------------------------------------------------
# Extensions are compared case-insensitively so "DATA.ZIP" / "Book.XLSX"
# are recognised as well.
if file_name.lower().endswith(".zip"):
    print(" Extracting ZIP…")
    with zipfile.ZipFile(file_name, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    for root, _, files_list in os.walk(extract_dir):
        for f in files_list:
            if f.lower().endswith((".xlsx", ".xls")):
                excel_files.append(os.path.join(root, f))
else:
    # Anything that is not a ZIP is treated as the Excel file itself.
    new_path = os.path.join(extract_dir, file_name)
    shutil.move(file_name, new_path)
    excel_files.append(new_path)

print("\nExcel files found:", excel_files)

if not excel_files:
    raise ValueError("No Excel files found!")

# --------------------------------------------------
# 4. DROPDOWN FILE SELECTOR
# --------------------------------------------------
dropdown = widgets.Dropdown(
    description='Choose Excel File:',
    options=excel_files,
    layout=widgets.Layout(width='600px'),
)
display(dropdown)
print("Select the file above, then click RUN AGAIN to continue.")

# --------------------------------------------------
# 5. LOAD THE SELECTED FILE
# --------------------------------------------------
# NOTE(review): the widget value is read right after it is displayed, in the
# same cell run, so this is whatever the dropdown holds at creation time.
excel_path = dropdown.value
print("\nUsing Excel file:", excel_path)
df = pd.read_excel(excel_path)
print("\nColumn Names:", df.columns.tolist())

# --------------------------------------------------
# 6. AUTO-DETECT TARGET COLUMN (Y)
# --------------------------------------------------
preferred_targets = ["value", "market_value", "price", "mv", "y"]
# First column whose lower-cased name is a preferred target. Labels are
# passed through str() because Excel headers can be non-strings (e.g. a
# numeric header), which have no .lower().
y_col = next(
    (col for col in df.columns if str(col).lower() in preferred_targets),
    None,
)

if y_col is None:
    # No preferred name present: fall back to the first numeric column.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        raise ValueError("No numeric target column found")
    y_col = numeric_cols[0]

print("Auto-selected Target Column (Y):", y_col)

# --------------------------------------------------
# 7. SELECT NUMERIC FEATURES (X)
# --------------------------------------------------
# Parse the target first, then drop rows without a usable target from df
# BEFORE deriving X, so X and y stay row-aligned and y contains no NaN.
# (Dropping from df after X and y were built left the NaNs in y.)
y = pd.to_numeric(df[y_col], errors='coerce')
valid_target = y.notna()
df = df.loc[valid_target]
y = y.loc[valid_target]

X = df.drop(columns=[y_col], errors='ignore')
# np.number matches the dtype filter used when auto-detecting the target and
# also keeps int32/float32 columns.
X = X.select_dtypes(include=[np.number])
# The median imputer used later discards columns that are entirely NaN,
# which would make the transformed array narrower than X.columns.
X = X.dropna(axis=1, how='all')

if X.shape[1] == 0:
    raise ValueError("No numeric features available!")

print("Numeric Features:", list(X.columns))

# --------------------------------------------------
# 8. HANDLE SKEWED TARGET WITH LOG TRANSFORMATION
# --------------------------------------------------
# log1p is only defined for values > -1; applying it to a target that reaches
# -1 or below would inject -inf/NaN into y, so the transform is skipped then.
log_flag = bool(abs(y.skew()) > 1 and y.min() > -1)
if log_flag:
    print("Applying log1p to Y because it is highly skewed...")
    y = np.log1p(y)

# --------------------------------------------------
# 9. TRAIN-TEST SPLIT
# --------------------------------------------------
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --------------------------------------------------
# 10. PREPROCESS PIPELINE
# --------------------------------------------------
# Median imputation followed by standardisation; fitted on the training
# split only and then applied to the test split.
preprocess = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

X_train_trans = preprocess.fit_transform(X_train)
X_test_trans = preprocess.transform(X_test)

# Convert back to DataFrame for feature selection. The original row index is
# kept so the frames stay label-aligned with Y_train / Y_test.
X_train = pd.DataFrame(X_train_trans, columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_trans, columns=X.columns, index=X_test.index)

# --------------------------------------------------
# 11. LASSO FEATURE SELECTION
# --------------------------------------------------
print("\nRunning LassoCV for feature selection...")
lasso = LassoCV(cv=5).fit(X_train, Y_train)
# Positions of the coefficients that were not shrunk to exactly zero.
selected_idx = np.flatnonzero(lasso.coef_)
selected_features = X.columns[selected_idx].tolist()
if not selected_features:
    # Lasso zeroed every coefficient: fall back to all features.
    selected_features = X.columns.tolist()
print("Selected Features after Lasso:", selected_features)

X_train, X_test = X_train[selected_features], X_test[selected_features]

# --------------------------------------------------
# 12. RANDOM FOREST FEATURE IMPORTANCE
# --------------------------------------------------
print("\nRunning Random Forest for feature importance...")
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=8,
    random_state=42,
).fit(X_train, Y_train)
rf_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
# Keep features whose importance is strictly above the average importance.
top_rf_features = rf_importances.loc[
    rf_importances.gt(rf_importances.mean())
].index.tolist()
if not top_rf_features:
    # Nothing is above the mean (e.g. all importances equal): keep everything.
    top_rf_features = X_train.columns.tolist()
print("Top Features after RF:", top_rf_features)

X_train, X_test = X_train[top_rf_features], X_test[top_rf_features]

# --------------------------------------------------
# 13. XGBOOST FEATURE IMPORTANCE
# --------------------------------------------------
print("\nRunning XGBoost for feature importance...")
xgb = XGBRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    verbosity=0,
).fit(X_train, Y_train)
xgb_importances = pd.Series(xgb.feature_importances_, index=X_train.columns)
# Keep features whose importance is strictly above the average importance.
top_xgb_features = xgb_importances.loc[
    xgb_importances.gt(xgb_importances.mean())
].index.tolist()
if not top_xgb_features:
    # Nothing is above the mean (e.g. all importances equal): keep everything.
    top_xgb_features = X_train.columns.tolist()
print("Top Features after XGBoost:", top_xgb_features)

X_train, X_test = X_train[top_xgb_features], X_test[top_xgb_features]

# --------------------------------------------------
# 14. CATBOOST FEATURE IMPORTANCE
# --------------------------------------------------
print("\nRunning CatBoost for feature importance...")
cat = CatBoostRegressor(
    iterations=500,
    depth=6,
    learning_rate=0.1,
    verbose=0,
    random_state=42,
)
cat.fit(X_train, Y_train)
cat_importances = pd.Series(cat.get_feature_importance(), index=X_train.columns)
# Keep features whose importance is strictly above the average importance.
top_cat_features = cat_importances.loc[
    cat_importances.gt(cat_importances.mean())
].index.tolist()
if not top_cat_features:
    # Nothing is above the mean (e.g. all importances equal): keep everything.
    top_cat_features = X_train.columns.tolist()
print("Top Features after CatBoost:", top_cat_features)

X_train, X_test = X_train[top_cat_features], X_test[top_cat_features]

# --------------------------------------------------
# 15. FINAL MODEL TRAINING
# --------------------------------------------------
print("\nTraining final Linear Regression model on selected features...")
# `preprocess` was already fitted on the full feature set above. Placing that
# same object in a new Pipeline would re-fit it in place on the reduced
# columns and overwrite its learned statistics, so the final model gets its
# own unfitted copy instead.
from sklearn.base import clone

final_lr = Pipeline([
    ("preprocess", clone(preprocess)),
    ("model", LinearRegression())
])
final_lr.fit(X_train, Y_train)
Y_pred = final_lr.predict(X_test)

# --------------------------------------------------
# 16. REVERSE LOG TRANSFORMATION (IF APPLIED)
# --------------------------------------------------
# Metrics are reported on the original scale: undo log1p when it was applied.
if log_flag:
    Y_test_real, Y_pred_real = np.expm1(Y_test), np.expm1(Y_pred)
else:
    Y_test_real, Y_pred_real = Y_test, Y_pred

# --------------------------------------------------
# 17. FINAL RESULTS
# --------------------------------------------------
final_r2 = r2_score(Y_test_real, Y_pred_real)
final_mse = mean_squared_error(Y_test_real, Y_pred_real)
# R2 clipped to [0, 1] and expressed as a percentage (computed, not printed).
accuracy = max(0, min(1, final_r2)) * 100

for banner_line in (
    "\n--------------------------",
    " FINAL MODEL RESULTS",
    "--------------------------",
):
    print(banner_line)
print("Features Used:", X_train.columns.tolist())
print("MSE:", final_mse)
print("R2 :", final_r2)
print("--------------------------")


Upload ZIP or Excel file…


Saving u-6  add logtransformatation&lasoregrassion,decisiontree,xgboost,randomforestalgorithm,deselect feature required,D-tree -OP.zip to u-6  add logtransformatation&lasoregrassion,decisiontree,xgboost,randomforestalgorithm,deselect feature required,D-tree -OP (2).zip
Uploaded: u-6  add logtransformatation&lasoregrassion,decisiontree,xgboost,randomforestalgorithm,deselect feature required,D-tree -OP (2).zip
 Extracting ZIP…

Excel files found: ['/content/extracted_excel/cleaned_tweets_premier_league_footballers_final (2) (1) (2) (3).xlsx', '/content/extracted_excel/cleaned_player_market_value (1) (1) (2) (1) (2) (5).xlsx', '/content/extracted_excel/player_performances_cleaned_partial (1) (1) (1) (2) (3).xlsx', '/content/extracted_excel/player_injuries_cleaned_final (2) (1) (1) (2) (3).xlsx', '/content/extracted_excel/cleaned_player_profiles (2) (1) (1) (1) (2) (3).xlsx']


Dropdown(description='Choose Excel File:', layout=Layout(width='600px'), options=('/content/extracted_excel/cl…

Select the file above, then click RUN AGAIN to continue.

Using Excel file: /content/extracted_excel/cleaned_tweets_premier_league_footballers_final (2) (1) (2) (3).xlsx

Column Names: ['player_name', 'text', 'vader_polarity', 'vader_emotion', 'tb_polarity', 'tb_emotion', 'game_date', 'tweet_date', 'when']
Auto-selected Target Column (Y): vader_polarity
Numeric Features: ['tb_polarity']

Running LassoCV for feature selection...
Selected Features after Lasso: ['tb_polarity']

Running Random Forest for feature importance...
Top Features after RF: ['tb_polarity']

Running XGBoost for feature importance...
Top Features after XGBoost: ['tb_polarity']

Running CatBoost for feature importance...
Top Features after CatBoost: ['tb_polarity']

Training final Linear Regression model on selected features...

--------------------------
 FINAL MODEL RESULTS
--------------------------
Features Used: ['tb_polarity']
MSE: 0.10379517210839248
R2 : 0.1891995592576715
--------------------------


In [10]:
# ==================================================
# 1. IMPORTS
# ==================================================
import pandas as pd
import numpy as np
import zipfile, os, shutil
from google.colab import files
import ipywidgets as widgets
from IPython.display import display

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings("ignore")

# XGBoost & CatBoost
try:
    from xgboost import XGBRegressor
except:
    !pip install xgboost
    from xgboost import XGBRegressor

try:
    from catboost import CatBoostRegressor
except:
    !pip install catboost
    from catboost import CatBoostRegressor

# ==================================================
# 2. FILE UPLOAD
# ==================================================
print("Upload ZIP or Excel file…")
uploaded = files.upload()
# `uploaded` is keyed by file name; guard against an empty result so the
# failure is a clear message rather than an IndexError on the lookup below.
if not uploaded:
    raise ValueError("No file was uploaded!")
file_name = next(iter(uploaded))

# Start from an empty extraction folder so stale files from an earlier run
# cannot show up in the file list.
extract_dir = "/content/extracted_excel"
if os.path.exists(extract_dir):
    shutil.rmtree(extract_dir)
os.makedirs(extract_dir, exist_ok=True)

excel_files = []

# Extensions are compared case-insensitively ("DATA.ZIP", "Book.XLSX").
if file_name.lower().endswith(".zip"):
    with zipfile.ZipFile(file_name, 'r') as z:
        z.extractall(extract_dir)
    # The loop variable must not be called `files`: that name is the
    # google.colab upload helper imported at the top of this cell, and
    # rebinding it to a list breaks any later files.upload() call.
    for root, _, filenames in os.walk(extract_dir):
        for f in filenames:
            if f.lower().endswith((".xlsx", ".xls")):
                excel_files.append(os.path.join(root, f))
else:
    # Anything that is not a ZIP is treated as the Excel file itself.
    new_path = os.path.join(extract_dir, file_name)
    shutil.move(file_name, new_path)
    excel_files.append(new_path)

# Fail here with a clear message instead of later inside read_excel.
if not excel_files:
    raise ValueError("No Excel files found!")

dropdown = widgets.Dropdown(
    description="Choose File:",
    options=excel_files,
)
display(dropdown)
print(" Select file and RUN AGAIN")

# ==================================================
# 3. LOAD DATA
# ==================================================
# NOTE(review): the widget value is read right after it is displayed, in the
# same cell run, so this is whatever the dropdown holds at creation time.
df = pd.read_excel(dropdown.value)
print("Columns:", df.columns.tolist())

# ==================================================
# 4. TARGET DETECTION
# ==================================================
preferred_targets = ["value", "market_value", "price", "mv", "y"]
# First column whose lower-cased name is a preferred target. Labels go
# through str() because Excel headers can be non-strings without .lower().
y_col = next(
    (c for c in df.columns if str(c).lower() in preferred_targets),
    None,
)

if y_col is None:
    # Fall back to the first numeric column; raise a descriptive error rather
    # than an IndexError when the sheet has no numeric column at all.
    numeric_cols = df.select_dtypes(include=np.number).columns
    if numeric_cols.empty:
        raise ValueError("No numeric target column found")
    y_col = numeric_cols[0]

print("Target:", y_col)

# ==================================================
# 5. FEATURE CLEANING
# ==================================================
# Parse the target first, then drop rows without a usable target from df
# BEFORE deriving X, so X and y stay row-aligned and y contains no NaN.
# (Dropping from df after X and y were built left the NaNs in y.)
y = pd.to_numeric(df[y_col], errors="coerce")
valid_target = y.notna()
df = df.loc[valid_target]
y = y.loc[valid_target]
X = df.drop(columns=[y_col])

# Convert date columns to "days since the earliest date in that column".
# str() guards against non-string column labels.
for c in X.columns:
    if "date" in str(c).lower():
        X[c] = pd.to_datetime(X[c], errors="coerce")
        X[c] = (X[c] - X[c].min()).dt.days

X = X.select_dtypes(include=np.number)
# The median imputer used later discards columns that are entirely NaN (for
# instance a date column where nothing parsed), which would make the
# transformed array narrower than X.columns.
X = X.dropna(axis=1, how="all")
if X.shape[1] == 0:
    raise ValueError("No numeric features")

# ==================================================
# 6. LOG TRANSFORM TARGET
# ==================================================
# log1p is only defined for values > -1; applying it to a target that reaches
# -1 or below would inject -inf/NaN into y, so the transform is skipped then.
log_flag = bool(abs(y.skew()) > 1 and y.min() > -1)
if log_flag:
    y = np.log1p(y)

# ==================================================
# 7. TRAIN TEST SPLIT
# ==================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Median imputation followed by standardisation; fitted on the training
# split only and then applied to the test split.
prep = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Rebuild DataFrames so columns can be selected by name later. The original
# row index is kept so the frames stay label-aligned with y_train / y_test.
X_train = pd.DataFrame(
    prep.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test = pd.DataFrame(
    prep.transform(X_test), columns=X.columns, index=X_test.index
)

# ==================================================
# 8. FEATURE SELECTION (RF → XGB → CAT)
# ==================================================
def top_features(model, X, y):
    """Fit *model* on (X, y) and return its feature importances.

    No filtering happens here despite the name: the returned Series holds
    one value per column of X, taken from ``model.feature_importances_``
    and indexed by ``X.columns``.
    """
    model.fit(X, y)
    importances = model.feature_importances_
    return pd.Series(data=importances, index=X.columns)

rf = RandomForestRegressor(
    n_estimators=500, max_depth=12, random_state=42
)
xgb = XGBRegressor(
    n_estimators=500, max_depth=8, learning_rate=0.05, subsample=0.8, verbosity=0
)
cat = CatBoostRegressor(
    iterations=600, depth=8, learning_rate=0.05, verbose=0
)

# Per-feature importances from each model, indexed by column name.
rf_f = top_features(rf, X_train, y_train)
xgb_f = top_features(xgb, X_train, y_train)
cat_f = pd.Series(cat.fit(X_train, y_train)
                  .get_feature_importance(), index=X_train.columns)

# For each model keep the features scoring strictly above its own mean.
rf_sel = set(rf_f[rf_f > rf_f.mean()].index)
xgb_sel = set(xgb_f[xgb_f > xgb_f.mean()].index)
cat_sel = set(cat_f[cat_f > cat_f.mean()].index)

# Prefer features all three models agree on; widen to the union when fewer
# than two features survive the intersection.
chosen = rf_sel & xgb_sel & cat_sel
if len(chosen) < 2:
    chosen = rf_sel | xgb_sel | cat_sel
if not chosen:
    # Nothing is strictly above the mean when all importances are equal
    # (e.g. a single feature); keep every column rather than training on none.
    chosen = set(X_train.columns)

# Build the list from X_train.columns so the feature order is stable; the
# iteration order of a set of strings can differ between sessions.
final_features = [c for c in X_train.columns if c in chosen]

print("Final Selected Features:", final_features)

X_train = X_train[final_features]
X_test = X_test[final_features]

# ==================================================
# 9. STACKING MODEL
# ==================================================
# Base learners for the stack. The random forest is seeded like the other
# forests in this notebook so repeated runs give the same score.
estimators = [
    ("rf", RandomForestRegressor(n_estimators=400, max_depth=10, random_state=42)),
    ("xgb", XGBRegressor(n_estimators=400, max_depth=8, learning_rate=0.05, verbosity=0)),
    ("cat", CatBoostRegressor(iterations=400, depth=8, verbose=0))
]

# passthrough=True feeds the original features to the final estimator in
# addition to the base learners' predictions.
stack = StackingRegressor(
    estimators=estimators,
    final_estimator=LinearRegression(),
    passthrough=True
)

stack.fit(X_train, y_train)
pred = stack.predict(X_test)

# ==================================================
# 10. REVERSE LOG
# ==================================================
# Metrics are reported on the original scale: undo log1p when it was applied.
if log_flag:
    y_test, pred = np.expm1(y_test), np.expm1(pred)

# ==================================================
# 11. FINAL METRICS
# ==================================================
mse = mean_squared_error(y_test, pred)
r2 = r2_score(y_test, pred)

for banner_line in (
    "\n==============================",
    " FINAL STACKING MODEL RESULT",
    "==============================",
):
    print(banner_line)
print("R2 SCORE:", round(r2, 4))
print("MSE:", round(mse, 4))
# Negative R2 is floored at 0 before being shown as a percentage.
print("Accuracy % (R2):", round(max(0, r2)*100, 2))
print("==============================")


Upload ZIP or Excel file…


Saving u-6  add logtransformatation&lasoregrassion,decisiontree,xgboost,randomforestalgorithm,deselect feature required,D-tree -OP.zip to u-6  add logtransformatation&lasoregrassion,decisiontree,xgboost,randomforestalgorithm,deselect feature required,D-tree -OP (6).zip


Dropdown(description='Choose File:', options=('/content/extracted_excel/cleaned_tweets_premier_league_football…

 Select file and RUN AGAIN
Columns: ['player_name', 'text', 'vader_polarity', 'vader_emotion', 'tb_polarity', 'tb_emotion', 'game_date', 'tweet_date', 'when']
Target: vader_polarity
Final Selected Features: ['tb_polarity']

 FINAL STACKING MODEL RESULT
R2 SCORE: 0.2513
MSE: 0.0958
Accuracy % (R2): 25.13


In [13]:
# ==================================================
# 1. IMPORTS
# ==================================================
import pandas as pd
import numpy as np
import zipfile, os, shutil
from google.colab import files
import ipywidgets as widgets
from IPython.display import display

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# ==================================================
# 2. FILE UPLOAD
# ==================================================
print("Upload ZIP or Excel file…")
uploaded = files.upload()
# `uploaded` is keyed by file name; guard against an empty result so the
# failure is a clear message rather than an IndexError on the lookup below.
if not uploaded:
    raise ValueError("No file was uploaded!")
file_name = next(iter(uploaded))

# Start from an empty extraction folder so stale files from an earlier run
# cannot show up in the file list.
extract_dir = "/content/extracted_excel"
if os.path.exists(extract_dir):
    shutil.rmtree(extract_dir)
os.makedirs(extract_dir, exist_ok=True)

excel_files = []

# Extensions are compared case-insensitively ("DATA.ZIP", "Book.XLSX").
if file_name.lower().endswith(".zip"):
    with zipfile.ZipFile(file_name, 'r') as z:
        z.extractall(extract_dir)
    # The loop variable must not be called `files`: that name is the
    # google.colab upload helper imported at the top of this cell, and
    # rebinding it to a list breaks any later files.upload() call.
    for root, _, filenames in os.walk(extract_dir):
        for f in filenames:
            if f.lower().endswith((".xlsx", ".xls")):
                excel_files.append(os.path.join(root, f))
else:
    # Anything that is not a ZIP is treated as the Excel file itself.
    new_path = os.path.join(extract_dir, file_name)
    shutil.move(file_name, new_path)
    excel_files.append(new_path)

# Fail here with a clear message instead of later inside read_excel.
if not excel_files:
    raise ValueError("No Excel files found!")

dropdown = widgets.Dropdown(
    description="Choose File:",
    options=excel_files,
)
display(dropdown)
print(" Select file and RUN AGAIN")

# ==================================================
# 3. LOAD DATA
# ==================================================
# NOTE(review): the widget value is read right after it is displayed, in the
# same cell run, so this is whatever the dropdown holds at creation time.
df = pd.read_excel(dropdown.value)
print("Columns:", df.columns.tolist())

# ==================================================
# 4. TARGET DETECTION
# ==================================================
preferred_targets = ["vader_polarity", "polarity", "sentiment", "y"]
# First column whose lower-cased name is a preferred target. Labels go
# through str() because Excel headers can be non-strings without .lower().
y_col = next(
    (c for c in df.columns if str(c).lower() in preferred_targets),
    None,
)
if y_col is None:
    # The numeric fallback is computed only when needed. As the default
    # argument of next() it was evaluated eagerly and raised IndexError on a
    # sheet without numeric columns even when a preferred column existed.
    numeric_cols = df.select_dtypes(include=np.number).columns
    if numeric_cols.empty:
        raise ValueError("No numeric target column found")
    y_col = numeric_cols[0]

print("Target column:", y_col)

# ==================================================
# 5. SENTIMENT CLASS CREATION
# ==================================================
# Values that cannot be parsed as numbers become NaN.
y_raw = pd.to_numeric(df[y_col], errors="coerce")

def sentiment_class(v, *, neg_threshold=-0.05, pos_threshold=0.05):
    """Map a polarity score to a sentiment class id.

    Args:
        v: Polarity score.
        neg_threshold: Scores at or below this value are Negative.
        pos_threshold: Scores at or above this value are Positive.

    Returns:
        0 for Negative, 2 for Positive, 1 for Neutral. NaN compares False
        against both thresholds and therefore also yields 1 (Neutral).
    """
    if v <= neg_threshold:
        return 0  # Negative
    if v >= pos_threshold:
        return 2  # Positive
    return 1  # Neutral

# Rows whose target could not be parsed are NaN in y_raw, and sentiment_class
# would silently label them Neutral. Drop them from df (the feature matrix is
# derived from df below) and from y_raw so only real scores become labels.
has_target = y_raw.notna()
df = df.loc[has_target]
y_raw = y_raw.loc[has_target]

y = y_raw.apply(sentiment_class)

# ==================================================
# 6. FEATURE CLEANING
# ==================================================
X = df.drop(columns=[y_col], errors="ignore")

# Convert date columns to "days since the earliest date in that column".
# str() guards against non-string column labels.
for c in X.columns:
    if "date" in str(c).lower():
        X[c] = pd.to_datetime(X[c], errors="coerce")
        X[c] = (X[c] - X[c].min()).dt.days

# np.number also keeps int32/float32 columns, unlike an int64/float64 list.
X = X.select_dtypes(include=np.number)
# The median imputer used later discards columns that are entirely NaN (for
# instance a date column where nothing parsed), which would make the
# transformed array narrower than X.columns.
X = X.dropna(axis=1, how="all")

if X.shape[1] == 0:
    raise ValueError("No numeric features found")

# ==================================================
# 7. SPLIT + SCALE
# ==================================================
# Stratified split keeps the class proportions equal in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Median imputation followed by standardisation; fitted on the training
# split only and then applied to the test split.
prep = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Rebuild DataFrames so columns can be selected by name later. The original
# row index is kept so the frames stay label-aligned with y_train / y_test.
X_train = pd.DataFrame(
    prep.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test = pd.DataFrame(
    prep.transform(X_test), columns=X.columns, index=X_test.index
)

# ==================================================
# 8. RANDOM FOREST FEATURE SELECTION (SAFE)
# ==================================================
rf = RandomForestClassifier(n_estimators=300, random_state=42).fit(X_train, y_train)

rf_imp = pd.Series(rf.feature_importances_, index=X_train.columns)
# Keep features whose importance is strictly above the average importance.
rf_features = rf_imp.loc[rf_imp.gt(rf_imp.mean())].index.tolist()

if not rf_features:
    # Nothing beat the mean: keep the three highest-ranked features instead.
    rf_features = rf_imp.sort_values(ascending=False).iloc[:3].index.tolist()

X_train, X_test = X_train[rf_features], X_test[rf_features]
print("After RF:", len(rf_features), "features")

# ==================================================
# 9. XGBOOST FEATURE SELECTION (SAFE)
# ==================================================
xgb = XGBClassifier(
    eval_metric="mlogloss",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
)
xgb.fit(X_train, y_train)

xgb_imp = pd.Series(xgb.feature_importances_, index=X_train.columns)
# Keep features whose importance is strictly above the average importance.
xgb_features = xgb_imp.loc[xgb_imp.gt(xgb_imp.mean())].index.tolist()

if not xgb_features:
    # Nothing beat the mean: keep the two highest-ranked features instead.
    xgb_features = xgb_imp.sort_values(ascending=False).iloc[:2].index.tolist()

X_train, X_test = X_train[xgb_features], X_test[xgb_features]
print("After XGBoost:", len(xgb_features), "features")

# ==================================================
# 10. CATBOOST FEATURE SELECTION (SAFE)
# ==================================================
cat = CatBoostClassifier(
    iterations=400,
    depth=6,
    learning_rate=0.05,
    verbose=0,
)
cat.fit(X_train, y_train)

cat_imp = pd.Series(cat.get_feature_importance(), index=X_train.columns)
# Keep features whose importance is strictly above the average importance.
cat_features = cat_imp.loc[cat_imp.gt(cat_imp.mean())].index.tolist()

if not cat_features:
    # Nothing beat the mean: keep only the single highest-ranked feature.
    cat_features = cat_imp.sort_values(ascending=False).iloc[:1].index.tolist()
    print(" No feature passed threshold — keeping top feature")

X_train, X_test = X_train[cat_features], X_test[cat_features]
print("After CatBoost:", len(cat_features), "features")

# ==================================================
# 11. FINAL MODEL (CATBOOST)
# ==================================================
final_model = CatBoostClassifier(
    iterations=600,
    depth=8,
    learning_rate=0.05,
    verbose=0
)

final_model.fit(X_train, y_train)
# Flatten the predictions to 1-D so the metric functions receive one label
# per sample in the same shape as y_test, instead of relying on implicit
# reshaping. NOTE(review): CatBoost appears to return a column vector for
# multi-class targets; confirm against the installed version.
preds = np.ravel(final_model.predict(X_test))

# ==================================================
# 12. FINAL RESULTS
# ==================================================
acc = accuracy_score(y_test, preds)

print("\n==============================")
print("FINAL ACCURACY:", round(acc * 100, 2), "%")
print("==============================")
# `labels` pins the three class ids (0/1/2 as produced by sentiment_class) to
# the three target names. Without it the report raises when one class is
# missing from both y_test and preds, because the name count no longer matches.
print(classification_report(
    y_test, preds,
    labels=[0, 1, 2],
    target_names=["Negative", "Neutral", "Positive"]
))


Upload ZIP or Excel file…


Saving u-6  add logtransformatation&lasoregrassion,decisiontree,xgboost,randomforestalgorithm,deselect feature required,D-tree -OP.zip to u-6  add logtransformatation&lasoregrassion,decisiontree,xgboost,randomforestalgorithm,deselect feature required,D-tree -OP (8).zip


Dropdown(description='Choose File:', options=('/content/extracted_excel/cleaned_tweets_premier_league_football…

 Select file and RUN AGAIN
Columns: ['player_name', 'text', 'vader_polarity', 'vader_emotion', 'tb_polarity', 'tb_emotion', 'game_date', 'tweet_date', 'when']
Target column: vader_polarity
After RF: 1 features
After XGBoost: 1 features
 No feature passed threshold — keeping top feature
After CatBoost: 1 features

FINAL ACCURACY: 64.03 %
              precision    recall  f1-score   support

    Negative       0.45      0.18      0.26      4448
     Neutral       0.61      0.81      0.70     12127
    Positive       0.71      0.64      0.67     12770

    accuracy                           0.64     29345
   macro avg       0.59      0.54      0.54     29345
weighted avg       0.63      0.64      0.62     29345

