In [2]:
# Code Library/Packages:
from types import ModuleType
import pandas as pd
import numpy as np
import random
import types
import pkg_resources
import gc
import tqdm as tqdm

# Visual Library/Packages:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
%matplotlib inline

# Sklearn Library/Packages:
from sklearn.model_selection import train_test_split

# NLTK Library/Packages:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Transformers Library/Packages:
import transformers

# Torch Library/Packages:
import torch

# Lightning Library/Packages:
import pytorch_lightning as pl

# Setting of seeds for comparable results for learning models:
SEED_VAL=42
random.seed(SEED_VAL)
np.random.seed(SEED_VAL)
torch.manual_seed(SEED_VAL)
torch.cuda.manual_seed(SEED_VAL)

# Set random seed and random seed for all lightning models:
RANDOM_SEED=42
pl.seed_everything(RANDOM_SEED)

# Processing Device(s):
processing_device= "cuda" if torch.cuda.is_available() else "cpu"
torch_aval = torch.cuda.is_available()

def get_imports():

    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            name = val.__name__.split(".")[0]

        elif isinstance(val, type):            
            name = val.__module__.split(".")[0]

        poorly_named_packages = {
            "PIL": "Pillow",
            "sklearn": "scikit-learn"
        }

        if name in poorly_named_packages.keys():
            name = poorly_named_packages[name]

        yield name

imports = list(set(get_imports()))

requirements = []
for m in pkg_resources.working_set:
    if m.project_name in imports and m.project_name!="pip":
        requirements.append((m.project_name, m.version))

# Set data paths:
IN=r'E:\random_data\in'
OUT=r'E:\random_data\out'

print("----------------------------------------------------")
print("###               VERSION TYPES                  ###")
print("----------------------------------------------------")
print(f"Versions: ", requirements)
print(f"Device(s) to Utilize: ", processing_device)
print(f"Is Torch Availabke?: ", torch_aval)
print(f"# of Devices Found: ", torch.__version__)
print("----------------------------------------------------")
print(f"If NVIDIA-SMI is not found, then CUDA isn't available on this device:")
!nvidia-smi
gc.collect()
print("----------------------------------------------------")
print(f"Clear Torch Cuda Memory: ", torch.cuda.empty_cache())
print("-----ENVIRONMENT IS COMPLETE & STAGED CORRECTLY-----")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\proto\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\proto\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\proto\AppData\Roaming\nltk_data...
  from .autonotebook import tqdm as notebook_tqdm
  warn(f"Failed to load image Python extension: {e}")
Global seed set to 42


----------------------------------------------------
###               VERSION TYPES                  ###
----------------------------------------------------
Versions:  [('tqdm', '4.64.1'), ('transformers', '4.25.1'), ('nltk', '3.7'), ('matplotlib', '3.6.2'), ('numpy', '1.23.5'), ('torch', '1.13.0'), ('pandas', '1.5.2'), ('seaborn', '0.12.1')]
Device(s) to Utilize:  cpu
Is Torch Availabke?:  False
# of Devices Found:  1.13.0+cpu
----------------------------------------------------
If NVIDIA-SMI is not found, then CUDA isn't available on this device:
Thu Dec  8 17:57:12 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 527.56       Driver Version: 527.56       CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|          

In [3]:
df = pd.read_csv(IN + r'\coffee_maker_ratings.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  4999 non-null   object
 1   target  4999 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 78.2+ KB


In [4]:
df['target'].value_counts()

1    2935
0    2064
Name: target, dtype: int64