In [None]:
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import os

# Load the project's environment variables from the .env file one directory up.
dotenv_path = Path('../.env')
load_dotenv(dotenv_path=dotenv_path)

# Fail early with an actionable message: without this check an unset variable
# only surfaces as an opaque TypeError raised by os.path.join(None, ...).
data_directory = os.getenv("OUTPUT_DIRECTORY")
if data_directory is None:
    raise RuntimeError(
        f"OUTPUT_DIRECTORY is not set; define it in the environment or in {dotenv_path}"
    )
feature_directory = os.path.join(data_directory, "features")

In [None]:
# Read the feature table from the feature directory and expose its "month"
# column under the name "year_month".
features_path = os.path.join(feature_directory, "ssi_lidl_spacy_nl_md_features.parquet")
feature_df = pd.read_parquet(features_path, engine="pyarrow").rename(
    columns={"month": "year_month"}
)
feature_df.head()

In [None]:
# Mean of the "features_spacy_nl_md" column per coicop_division, stored as a
# Series named "euclidean_centroid" (indexed by coicop_division).
division_groups = feature_df.groupby(by="coicop_division")
centroids_df = division_groups["features_spacy_nl_md"].mean().rename("euclidean_centroid")
centroids_df

In [None]:
# Attach each row's division centroid. A "euclidean_centroid" column left over
# from an earlier run of this cell is dropped first; otherwise the merge would
# produce euclidean_centroid_x / euclidean_centroid_y and the cells that read
# "euclidean_centroid" would fail.
feature_df = feature_df.drop(columns=["euclidean_centroid"], errors="ignore").merge(
    centroids_df, on="coicop_division"
)
feature_df.head()

In [None]:
from scipy.spatial.distance import euclidean


# Euclidean distance between each row's feature vector and the centroid that
# was merged onto the same row.
feature_vectors = feature_df["features_spacy_nl_md"]
division_centroids = feature_df["euclidean_centroid"]
feature_df["centroid_distances"] = [
    euclidean(vector, centroid)
    for vector, centroid in zip(feature_vectors, division_centroids)
]
feature_df

In [None]:
import matplotlib.pyplot as plt

# Distribution of the centroid distances, one box per coicop_division.
feature_df.boxplot(
    column="centroid_distances",
    by="coicop_division",
    figsize=(10, 10),
)

In [None]:
# One bar chart of centroid distances per coicop_division. Each division gets
# its own figure and title: without an explicit `ax`, successive Series plots
# are drawn onto the same current axes and end up overlapping.
for coicop_division in feature_df.coicop_division.unique():
    # .loc selects rows and column in one step (no chained indexing).
    division_distances = feature_df.loc[
        feature_df.coicop_division == coicop_division, "centroid_distances"
    ]
    fig, ax = plt.subplots(figsize=(10, 10))
    division_distances.plot.bar(ax=ax)
    ax.set_title(f"Centroid distances for coicop_division {coicop_division}")
    ax.set_xlabel("feature_df row index")
    ax.set_ylabel("centroid_distances")
    plt.show()

In [None]:
# Compute the quartiles (q1, q3) and the IQR (inter-quartile range) of the centroid distances per coicop_division; rows more than 1.5 * IQR below q1 or above q3 are treated as outliers
from scipy.stats import iqr 

# Per-division first and third quartile of the centroid distances.
centroid_iqr = feature_df.groupby(by="coicop_division")["centroid_distances"]
q1 = centroid_iqr.quantile(0.25).rename("q1")
q3 = centroid_iqr.quantile(0.75).rename("q3")

# Drop q1/q3/iqr columns left over from an earlier run of this cell; otherwise
# the merges would create q1_x/q1_y (and q3_x/q3_y) and `feature_df.q1` below
# would no longer resolve.
feature_df = feature_df.drop(columns=["q1", "q3", "iqr"], errors="ignore")
feature_df = feature_df.merge(q1, on="coicop_division")
feature_df = feature_df.merge(q3, on="coicop_division")
feature_df["iqr"] = feature_df.q3 - feature_df.q1
feature_df

In [None]:
# Rows whose centroid distance lies strictly below q1 - 1.5 * iqr or strictly
# above q3 + 1.5 * iqr of their coicop_division.
division_lower_fence = feature_df["q1"] - 1.5 * feature_df["iqr"]
division_upper_fence = feature_df["q3"] + 1.5 * feature_df["iqr"]
is_below_fence = feature_df["centroid_distances"] < division_lower_fence
is_above_fence = feature_df["centroid_distances"] > division_upper_fence
outliers_df = feature_df[is_below_fence | is_above_fence]
outliers_df

In [None]:
# Number of outlier rows per coicop_division.
outliers_df["coicop_division"].value_counts()

In [None]:
# Rows whose centroid distance lies inside the closed interval
# [q1 - 1.5 * iqr, q3 + 1.5 * iqr] of their coicop_division.
inlier_lower_bound = feature_df["q1"] - 1.5 * feature_df["iqr"]
inlier_upper_bound = feature_df["q3"] + 1.5 * feature_df["iqr"]
is_inlier = feature_df["centroid_distances"].between(inlier_lower_bound, inlier_upper_bound)
inliers_df = feature_df[is_inlier]
inliers_df

In [None]:
# Number of inlier rows per coicop_division.
inliers_df["coicop_division"].value_counts()

In [None]:
import wordcloud
from IPython.display import SVG, display

# Word cloud generator, reused by the word-cloud cells further down.
all_words = wordcloud.WordCloud()

# Word cloud over the ean_name values of the inliers in coicop_division "01".
division_inlier_names = inliers_df.loc[inliers_df["coicop_division"] == "01", "ean_name"]
inlier_words = " ".join(division_inlier_names)
inlier_cloud = all_words.generate_from_text(inlier_words)
display(SVG(inlier_cloud.to_svg()))

In [None]:
# Word cloud over the ean_name values of the outliers in coicop_division "01".
division_outlier_names = outliers_df.loc[outliers_df["coicop_division"] == "01", "ean_name"]
outlier_words = " ".join(division_outlier_names)
outlier_cloud = all_words.generate_from_text(outlier_words)
display(SVG(outlier_cloud.to_svg()))

In [None]:
# Distribution of the centroid distances, one box per coicop_group.
feature_df.boxplot(
    column="centroid_distances",
    by="coicop_group",
    figsize=(10, 10),
)

In [None]:
# Per-coicop_group first and third quartile of the centroid distances.
centroid_iqr_group = feature_df.groupby(by="coicop_group")["centroid_distances"]
q1 = centroid_iqr_group.quantile(0.25).rename("q1_group")
q3 = centroid_iqr_group.quantile(0.75).rename("q3_group")

# Drop the *_group columns left over from an earlier run of this cell; otherwise
# the merges would create q1_group_x/q1_group_y (and q3_group_x/q3_group_y) and
# `feature_df.q1_group` below would no longer resolve.
feature_df = feature_df.drop(columns=["q1_group", "q3_group", "iqr_group"], errors="ignore")
feature_df = feature_df.merge(q1, on="coicop_group")
feature_df = feature_df.merge(q3, on="coicop_group")
feature_df["iqr_group"] = feature_df.q3_group - feature_df.q1_group
feature_df

In [None]:
# Split the rows on the 1.5 * iqr_group fences around q1_group / q3_group:
# strictly outside the fences -> outlier, on or inside the fences -> inlier.
group_lower_fence = feature_df["q1_group"] - 1.5 * feature_df["iqr_group"]
group_upper_fence = feature_df["q3_group"] + 1.5 * feature_df["iqr_group"]
group_distances = feature_df["centroid_distances"]

is_group_outlier = (group_distances < group_lower_fence) | (group_distances > group_upper_fence)
is_group_inlier = (group_distances >= group_lower_fence) & (group_distances <= group_upper_fence)

outliers_group_df = feature_df[is_group_outlier]
inliers_group_df = feature_df[is_group_inlier]

inliers_group_df

In [None]:
# Word cloud over the ean_name values of the inliers in coicop_group "012".
group_inlier_names = inliers_group_df.loc[inliers_group_df["coicop_group"] == "012", "ean_name"]
inlier_words = " ".join(group_inlier_names)
group_inlier_cloud = all_words.generate_from_text(inlier_words)
display(SVG(group_inlier_cloud.to_svg()))

In [None]:
# Word cloud over the ean_name values of the outliers in coicop_group "012".
group_outlier_names = outliers_group_df.loc[outliers_group_df["coicop_group"] == "012", "ean_name"]
outlier_words = " ".join(group_outlier_names)
group_outlier_cloud = all_words.generate_from_text(outlier_words)
display(SVG(group_outlier_cloud.to_svg()))

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with default settings on all feature vectors and print the share
# of variance explained by each component.
all_feature_vectors = feature_df["features_spacy_nl_md"].tolist()
pca = PCA().fit(all_feature_vectors)
print(pca.explained_variance_ratio_)

In [None]:
# Explained-variance curve with a vertical marker at component index 12
# (the same number is passed as n_components to the PCA in a later cell).
x = 12
marker_y_range = [0, 0.15]
plt.plot(pca.explained_variance_ratio_)
plt.plot([x, x], marker_y_range)

In [None]:
# Drop in explained variance from each component to the next one.
variance_ratio = pca.explained_variance_ratio_
diff = variance_ratio[:-1] - variance_ratio[1:]
plt.plot(diff)

In [None]:
# Cut-off point: the component-to-component drops that exceed 1e-3, and how many there are.
drops_above_threshold = diff[diff > 1e-3]
len(drops_above_threshold), drops_above_threshold

In [None]:
# Project all feature vectors onto 12 principal components.
# NOTE(review): the PCA is fitted on every row of feature_df, i.e. before the
# train/test split made in a later cell, so the held-out rows also influence
# the projection.
pca = PCA(n_components=12)
pca_input_vectors = feature_df["features_spacy_nl_md"].tolist()
features_pca = pca.fit_transform(pca_input_vectors)

In [None]:
# Display the PCA-transformed feature matrix.
features_pca

In [None]:
# Store one list of PCA components per row; the assignment is positional, so it
# relies on features_pca having been computed from feature_df in its current row order.
feature_df["features_pca"] = features_pca.tolist() 

In [None]:
# Number of rows in feature_df.
len(feature_df)

In [None]:
# Display feature_df, which now includes the "features_pca" column.
feature_df

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as test data, stratified on coicop_division.
# random_state is fixed so the split, and every metric computed from it,
# is identical on each run of the notebook.
train_val_data, test_data = train_test_split(
    feature_df,
    test_size=0.2,
    stratify=feature_df.coicop_division,
    random_state=42,
)
train_val_data.shape, test_data.shape

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (default settings) predicting coicop_division from the
# PCA-reduced features of the training rows.
pca_train_features = train_val_data["features_pca"].tolist()
pca_train_labels = train_val_data["coicop_division"].tolist()

logistic_regression = LogisticRegression()
lr_model = logistic_regression.fit(pca_train_features, pca_train_labels)

In [None]:
from sklearn.metrics import classification_report

# Predict coicop_division for the test rows from their PCA features and print
# the classification report against the true divisions.
pca_test_features = test_data["features_pca"].tolist()
y_pred = lr_model.predict(pca_test_features)

pca_test_labels = test_data["coicop_division"].values
print(classification_report(pca_test_labels, y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (default settings) predicting coicop_division from the
# full "features_spacy_nl_md" vectors of the training rows.
# Note: this rebinds `lr_model`, replacing the model fitted on the PCA features.
full_train_features = train_val_data["features_spacy_nl_md"].tolist()
full_train_labels = train_val_data["coicop_division"].tolist()

lr_features = LogisticRegression()
lr_model = lr_features.fit(full_train_features, full_train_labels)

In [None]:
from sklearn.metrics import classification_report

# Predict coicop_division for the test rows from their full feature vectors and
# print the classification report against the true divisions.
full_test_features = test_data["features_spacy_nl_md"].tolist()
y_pred = lr_model.predict(full_test_features)

full_test_labels = test_data["coicop_division"].values
print(classification_report(full_test_labels, y_pred))