In [None]:
from snowflake.snowpark import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import functions as F
from snowflake.snowpark.types import DoubleType
from dotenv import load_dotenv
from sklearn.feature_extraction import text as txt
from sklearn import svm
from joblib import dump

In [None]:
# Load variables from a local .env file (if any) into the process environment
# before any connection settings are read.
load_dotenv()

In [None]:
!pip install scikit-learn

In [None]:
# Build the connection parameters for the named connection "test_conn".
# NOTE(review): presumably resolved from the local Snowflake connection config — confirm.
pars=SnowflakeLoginOptions("test_conn")

In [None]:
# Inspect the connection parameters without echoing credentials into the saved
# notebook output: any key that looks secret-bearing is shown masked.
{
    key: "***" if any(
        marker in str(key).lower()
        for marker in ("password", "passcode", "token", "secret", "key")
    ) else value
    for key, value in pars.items()
}

In [None]:
# Override the target database and schema before the session is created.
pars['database']='imdb'
pars['schema']='public'

In [None]:
# Open a Snowpark session configured from the connection parameters above.
session=Session.builder.configs(pars).create()

In [None]:
# Display the session object to confirm it was created.
session

In [None]:
# Tag the queries issued by this session so they can be identified later.
session.query_tag='sentiment-1'

In [None]:
# Reference the training table through the session and preview some rows.
df=session.table("train_dataset")
df.show()

In [None]:
# List the column names of the training table.
df.columns

In [None]:
# Derive a numeric label column: 1 when SENTIMENT is 'positive', 2 otherwise.
is_positive = df['SENTIMENT'] == 'positive'
sentiment_flag = F.when(is_positive, 1).otherwise(value=2)
df_flag = df.withColumn("SENTIMENT_FLAG", sentiment_flag)

df_flag.show()

In [None]:
# Materialize the Snowpark frame exactly once. Each toPandas() call re-runs the
# query, and without an ORDER BY two separate results are not guaranteed to come
# back in the same row order -- which would silently misalign reviews and labels.
train_pdf=df_flag.toPandas()
train_x=train_pdf['REVIEW'].values
train_y=train_pdf['SENTIMENT_FLAG'].values

In [None]:
# Lower document-frequency bound, expressed as the fraction that one single
# training review represents.
one_review_fraction = 1.0 / len(train_x)

# Binary bag-of-words over unigrams and bigrams; tokens are runs of at least two
# word characters.
vector = txt.CountVectorizer(
    analyzer='word',
    token_pattern="[\\w]+\\w\\b",
    ngram_range=(1, 2),
    min_df=one_review_fraction,
    max_df=0.02,
    binary=True,
    vocabulary=None,
)

In [None]:
# Fit the vectorizer on the training reviews and transform them into the
# bag-of-words matrix, then save the fitted vectorizer to a local joblib file.
bow=vector.fit_transform(train_x)
dump(vector,'vect_review1.joblib',compress=True)

In [None]:
# Check how many training reviews were loaded.
train_x.shape

In [None]:
# Linear support-vector classifier with regularization C=1.8, capped at 1000 iterations.
model=svm.LinearSVC(C=1.8,max_iter=1000)

In [None]:
# Train the classifier on the bag-of-words features and the numeric labels.
model.fit(bow,train_y)

In [None]:
# Save the fitted classifier to a local joblib file.
dump(model,"model_review1.joblib",compress=True)

In [None]:
# Mean accuracy on the training data. For a scikit-learn classifier such as
# LinearSVC, score() reports accuracy, not an R2 score.
model.score(bow,train_y)

In [None]:
# Isolate all code into a Python function
def train_imdb(session:Session, train_dataset_name:str):
    """Train the review sentiment classifier and upload its artifacts to ``@models``.

    Reads ``train_dataset_name`` through ``session`` and derives a numeric
    ``SENTIMENT_FLAG`` label (1 when ``SENTIMENT`` equals ``'positive'``,
    otherwise 2). It then fits a binary bag-of-words ``CountVectorizer`` on the
    ``REVIEW`` column and a ``LinearSVC`` on the resulting matrix. Each fitted
    object is written with joblib to the current working directory
    (``vect_review1.joblib`` and ``model_review1.joblib``) and then uploaded to
    the ``@models`` stage, overwriting any existing file of the same name.

    Args:
        session: Snowpark session used to read the table and upload the files.
        train_dataset_name: Name of the table that holds the ``REVIEW`` and
            ``SENTIMENT`` columns.

    Returns:
        dict: ``{'Status': 'Success', 'R2 Score Train': <float>}``. Despite the
        key name, the value is ``model.score(...)`` on the training data, which
        for a scikit-learn classifier is mean accuracy. The key is kept as-is
        so existing consumers of the result keep working.
    """
    # Imports stay function-local, restricted to the names the body actually uses.
    from snowflake.snowpark import functions as F
    from sklearn.feature_extraction import text as txt
    from sklearn import svm
    from joblib import dump

    def _dump_and_upload(obj, local_file_name):
        # Serialize `obj` to a local joblib file, then upload that file to @models.
        dump(obj,local_file_name,compress=True)
        session.file.put(
            local_file_name=local_file_name,
            stage_location="@models",
            auto_compress=True,
            overwrite=True
        )

    df=session.table(train_dataset_name)
    df_flag=df.withColumn(
        "SENTIMENT_FLAG",
        F.when(df['SENTIMENT']=='positive',1).otherwise(value=2)
    )

    # Materialize once: a second toPandas() would re-run the query, and without
    # an ORDER BY the two results are not guaranteed to share a row order, which
    # would misalign reviews and labels.
    train_pdf=df_flag.toPandas()
    train_x=train_pdf['REVIEW'].values
    train_y=train_pdf['SENTIMENT_FLAG'].values

    vector=txt.CountVectorizer(
        token_pattern="[\\w]+\\w\\b",
        ngram_range=(1,2),
        analyzer='word',
        max_df=0.02,
        # Lower bound expressed as the fraction that one review represents.
        min_df=1*1./len(train_x),
        vocabulary=None,
        binary=True
    )

    bow=vector.fit_transform(train_x)
    _dump_and_upload(vector,'vect_review1.joblib')

    model=svm.LinearSVC(C=1.8,max_iter=1000)
    model.fit(bow,train_y)
    _dump_and_upload(model,'model_review1.joblib')

    return {'Status':'Success','R2 Score Train': model.score(bow,train_y)}

In [None]:
# Run the packaged training routine against the same training table.
ret=train_imdb(session=session,train_dataset_name="train_dataset")

In [None]:
# Show the status dictionary returned by train_imdb.
ret

In [None]:
# List the files in the @models stage to confirm both artifacts were uploaded.
session.sql("LS @models").show()

In [None]:
# Reader options for the staged CSV files: comma-delimited, optionally
# double-quoted fields, with the header row parsed and the schema inferred.
# NOTE(review): this rebinds `pars`, which earlier held the connection
# parameters passed to Session.builder.configs; re-running the session-creation
# cell after this one would pick up these CSV options instead.
pars={
    "field_delimiter":",",
    "field_optionally_enclosed_by":'"',
    "infer_schema":True,
    "parse_header":True
}

In [None]:
# Read the CSV data from the @EXT_STAGE_LIST stage with the options above and
# preview it. Note that `df` is rebound here and no longer refers to the
# training table loaded earlier.
df=session.read.options(pars).csv("@EXT_STAGE_LIST")
df.show()

In [None]:
# Summary statistics for the staged dataset.
df.describe().show()

In [None]:
# Normalize CUT: replace every run of non-alphanumeric characters with "_" and
# upper-case the result.
# Snowpark DataFrames are immutable -- with_column returns a new DataFrame -- so
# the result must be bound back to `df`, otherwise the cleanup is discarded and
# the preview below shows the unchanged data.
df=df.with_column(
    "CUT",
    F.upper(
        F.regexp_replace(
            subject=F.col("CUT"),
            pattern="[^a-zA-Z0-9]+",
            replacement="_"
        )
    )
)
df.show()

In [None]:
# Inspect the schema fields (name and type) of the staged dataset.
list(df.schema)

In [None]:
# List the column names of the staged dataset.
df.columns

In [None]:
# Cast each listed measurement column to DOUBLE, replacing it under the same name.
double_columns = ['carat', "X", "Y", "Z", "DEPTH", "TABLE_PCT"]
for colname in double_columns:
    as_double = df[colname].cast(DoubleType())
    df = df.with_column(colname, as_double)