# ML/AI Python Syntax

### Insert Image

In [None]:
# Render an image file inline in the notebook.
# The package is spelled `IPython` (capital I and P); `Ipython` raises ModuleNotFoundError.
from IPython.display import Image

# Raw string: "\A" is not a valid escape sequence and triggers a SyntaxWarning on
# recent Python versions. `filename=` makes a missing file fail loudly instead of
# the string being treated as raw image data.
Image(filename=r"Desktop\AI.jpg")

[Markdown Syntax for Jupyter Notebook](https://www.google.com/search?sca_esv=bf3933bb65ad3238&rlz=1C1VDKB_enIN1116IN1116&sxsrf=ADLYWIJi5feXrQaLkoCEOl4naiF2QKwgTg:1731563028360&q=how+to+write+text+in+jupyter+notebook&udm=7&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWwAFG7ranuZ26H8lR7pf_8AzBs6lnFFuPH6eU3OV27QKh6ftn9lc4yAcaBgSvqjbS08AwYK5ArknIAZUHpOTMwOfbDzKV1Lg2Z-eWD1L9zdnHdJ4sj2t_Q5kwi_HNvC8EgEhQLOg6AHB7Se-IQZO7CfiZgJ_RkuJCJFoIxEJeBEul6X5BiaUsuxkiVfXWoRDLX9ufwA&sa=X&ved=2ahUKEwjj-9f5jtuJAxVV-zgGHcLgJbcQtKgLegQIEhAB&biw=1280&bih=631&dpr=1.5#fpstate=ive&vld=cid:ccdf81a7,vid:mTIifW_LU5s,st:0)

#### Importing Libraries

In [None]:
import string

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, PolynomialFeatures, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import accuracy_score, precision_score, r2_score, mean_absolute_error
from sklearn import preprocessing, model_selection, metrics, feature_selection

from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn import neighbors, linear_model, svm, tree, ensemble
from wordcloud import WordCloud, STOPWORDS
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA

##### Importing the file

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("laptop_data.csv")
# info() prints column names, dtypes and non-null counts.
df.info()
# describe() is the last expression of the cell, so its summary table is what the cell displays.
df.describe()

### Get a breakdown of your data (column types and null values)

In [None]:
# Summarise every column of df_initial in a three-row table:
# its dtype, its number of null values, and its percentage of null values.
null_counts = df_initial.isnull().sum()  # computed once, reused for the count and the percentage

# pd.concat() is used because DataFrame.append() was removed in pandas 2.0.
tab_info = pd.concat([
    pd.DataFrame(df_initial.dtypes).T.rename(index={0: 'column type'}),
    pd.DataFrame(null_counts).T.rename(index={0: 'null values (nb)'}),
    pd.DataFrame(null_counts / df_initial.shape[0] * 100).T.rename(index={0: 'null values (%)'}),
])

# Bare last expression so the notebook actually renders the table.
tab_info

In [None]:
# One-row table ('quantity') holding the number of distinct products,
# transactions and customers found in df_initial.
pd.DataFrame(
    [{
        label: df_initial[column].nunique()
        for label, column in (
            ('products', 'StockCode'),
            ('transactions', 'InvoiceNo'),
            ('customers', 'CustomerID'),
        )
    }],
    columns=['products', 'transactions', 'customers'],
    index=['quantity'],
)

In [None]:
# Remove every row of df_initial whose CustomerID is missing (NaN).
df_initial.dropna(axis=0, subset=['CustomerID'], inplace=True)
# 1) axis=0: drop rows (not columns) that contain missing values.
# 2) subset=['CustomerID']: only the CustomerID column is inspected for missing values.
# 3) inplace=True: df_initial itself is modified; no new DataFrame is returned.

### Create separate lists of numeric and categorical columns

In [None]:
# Split df by dtype: first take the sub-frame, then read the column names off it.
numeric_df = df.select_dtypes(include=[np.number])
numeric_columns = list(numeric_df.columns)

# Text-like columns: plain Python objects (usually strings) and pandas categoricals.
categorical_df = df.select_dtypes(include=['object', 'category'])
categorical_columns = list(categorical_df.columns)

## Converting the Target column from string to numeric

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the target column, then overwrite the
# column with the integer codes. `encoder` is kept so the mapping can be inspected later.
encoder = LabelEncoder().fit(df['target'])
df['target'] = encoder.transform(df['target'])

In [None]:
# Report data-quality counts before cleaning. A notebook only displays the LAST
# expression of a cell, so these are printed explicitly; as bare expressions in
# the middle of the cell their results were silently discarded.
print("Duplicate rows:", df.duplicated().sum())
print("Null values per column:")
print(df.isnull().sum())

# Remove duplicate rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [13]:
# Convert the InvoiceDate column of df_initial to pandas datetime values.
df_initial['InvoiceDate'] = pd.to_datetime(df_initial['InvoiceDate'])
# Rename columns of df in place: 'v1' becomes 'target' and 'v2' becomes 'text'.
# NOTE(review): the two statements act on different frames (df_initial vs df).
df.rename(columns={'v1':'target','v2':'text'},inplace=True)

## Binning the Values

In [None]:
# Bucket every 'Basket Price' value into a price interval with pd.cut(), then
# count how many baskets fall into each interval.
import pandas as pd

# Bin edges: consecutive pairs define the intervals handed to pd.cut().
price_range = [0, 50, 100, 200, 500, 1000, 5000, 50000]

# Label each row with the interval its basket price falls into.
basket_price['price_category'] = pd.cut(basket_price['Basket Price'], bins=price_range)

# Rows per interval, ordered by interval, as a plain Python list.
count_price = (
    basket_price['price_category']
    .value_counts()
    .sort_index()
    .tolist()
)

print(count_price)

### Doing the same binning manually with a loop

In [None]:
# Manual equivalent of pd.cut(...).value_counts(): count the baskets in each
# (lower, upper] interval defined by consecutive edges of price_range.
price_range = [0, 50, 100, 200, 500, 1000, 5000, 50000]
basket_values = basket_price['Basket Price']

# The upper bound is inclusive, matching pd.cut's default right-closed bins.
# With strict comparisons on both sides, a basket priced exactly on an edge
# (e.g. 50) was counted in no bin at all.
count_price = [
    int(((basket_values > lower) & (basket_values <= upper)).sum())
    for lower, upper in zip(price_range, price_range[1:])
]

In [15]:
# Replace placeholder entries (a dash, or 'not supported') in a column with
# values drawn at random from the valid entries of that same column.
import numpy as np

# Placeholder strings that should not survive in the cleaned column.
value_omitted = ('–', 'not supported')

# Boolean mask of the rows to fix, and the pool of valid values to draw from.
is_placeholder = df['fuel_type'].isin(value_omitted)
value_req = df.loc[~is_placeholder, 'fuel_type']

if is_placeholder.any():
    if value_req.empty:
        raise ValueError("fuel_type has no valid values to sample replacements from")
    # Seeded generator so that Restart & Run All reproduces the same imputation.
    rng = np.random.default_rng(42)
    # Draw all replacements in one vectorised call instead of one call per row.
    df.loc[is_placeholder, 'fuel_type'] = rng.choice(
        value_req.to_numpy(), size=int(is_placeholder.sum())
    )


In [17]:
# Remove unwanted characters from a column: drop the 'GB' suffix from 'Ram',
# then store what is left as 32-bit integers.
ram_without_unit = df['Ram'].str.replace('GB','')
df['Ram'] = ram_without_unit.astype('int32')

In [19]:
# Quick-look plots. Each plot gets its own figure and its own plt.show():
# without them, consecutive plotting calls draw on top of each other on the
# current axes (frequency bars + scatter + pie ended up in one figure).

# Bar plot: Price per Company
plt.figure()
sns.barplot(x=df['Company'], y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()

# Frequency plot: number of rows per Company
plt.figure()
df['Company'].value_counts().plot(kind='bar')
plt.show()

# Scatter plot: Weight against Price
plt.figure()
sns.scatterplot(x=df['Weight'], y=df['Price'])
plt.show()

# Pie chart of the target distribution
# NOTE(review): the labels are hard-coded and assume value_counts() returns
# 'ham' first (i.e. ham is the more frequent class) — confirm for your data.
plt.figure()
plt.pie(df['target'].value_counts(), labels=['ham', 'spam'], autopct="%0.2f")
plt.show()

# Pair plot (seaborn creates its own figure)
sns.pairplot(df, hue='target')
plt.show()

# Heat map of correlations. Only numeric columns are correlated: calling
# df.corr() on a frame that still holds text columns raises on pandas >= 2.0.
plt.figure()
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
plt.show()

In [23]:
# Turn a text column into a binary flag: 1 when the ScreenResolution string
# contains 'Touchscreen', 0 otherwise.
df['Touchscreen'] = df['ScreenResolution'].apply(lambda res: int('Touchscreen' in res))

# For values such as 'IPS Panel Touchscreen 1920x1200', split the text into two parts:
#  - pat='x', n=1 splits each string at the first occurrence of 'x' only
#  - expand=True returns the pieces as separate columns of a new DataFrame
new = df['ScreenResolution'].str.split(pat='x', n=1, expand=True)

In [None]:
# Drop a column from df in place.
df.drop(labels=['Resolution'], axis='columns', inplace=True)

# Keep only the first three words of each 'Cpu' value, e.g.
# "Intel Core i7 8550U" -> "Intel Core i7", "AMD Ryzen 5 3500U" -> "AMD Ryzen 5".
#  - cpu.split() breaks the string on whitespace into a list of words
#  - [:3] keeps the first three words
#  - " ".join(...) glues them back together with single spaces
df['Cpu_Model'] = df['Cpu'].apply(lambda cpu: " ".join(cpu.split()[:3]))

### Training baseline classifiers without hyperparameter tuning

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Baseline estimators, each with fixed (untuned) hyperparameters.
# Single models
svc = SVC(gamma=1.0, kernel='sigmoid')
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(penalty='l1', solver='liblinear')

# Ensembles: 50 estimators each, seeded for repeatable results
rfc = RandomForestClassifier(random_state=2, n_estimators=50)
abc = AdaBoostClassifier(random_state=2, n_estimators=50)
bc = BaggingClassifier(random_state=2, n_estimators=50)
etc = ExtraTreesClassifier(random_state=2, n_estimators=50)
gbdt = GradientBoostingClassifier(random_state=2, n_estimators=50)
xgb = XGBClassifier(random_state=2, n_estimators=50)

# Display name -> estimator; insertion order is the order the models are evaluated in.
clfs = dict(
    SVC=svc,
    KN=knc,
    NB=mnb,
    DT=dtc,
    LR=lrc,
    RF=rfc,
    AdaBoost=abc,
    BgC=bc,
    ETC=etc,
    GBDT=gbdt,
    xgb=xgb,
)

def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit `clf` on the training split and score it on the test split.

    Parameters
    ----------
    clf : estimator exposing `fit(X, y)` and `predict(X)`.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.

    Returns
    -------
    tuple
        `(accuracy, precision)` as computed by `accuracy_score` and
        `precision_score` (called with their default arguments) on the
        test-split predictions.
    """
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return accuracy_score(y_test, y_pred), precision_score(y_test, y_pred)


# Try it on a single classifier. The call previously read
# `classifiertrain_classifier(...)`, which is a NameError; the result is printed
# because a bare expression in the middle of a cell is not displayed.
print(train_classifier(svc,X_train,y_train,X_test,y_test))


# Evaluate every model in `clfs` and collect its accuracy and precision.
accuracy_scores = []
precision_scores = []

for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)

    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

# Collect the scores in a DataFrame, best precision first.
# (The original statement was missing its closing parenthesis -> SyntaxError.)
performance_df = pd.DataFrame({
    'Algorithm': list(clfs.keys()),
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
}).sort_values('Precision', ascending=False)

# Bare last expression so the notebook renders the table.
performance_df

## Pipeline preprocessing and Fit

In [None]:
# Linear regression on polynomial features, wrapped in a single Pipeline:
# categorical encoding -> polynomial expansion -> robust scaling -> linear model.


# Category orderings handed to OrdinalEncoder; the position in each list becomes the integer code.
# NOTE(review): company_order is defined but not used in this cell ('Company' is one-hot encoded below).
company_order = ['Apple', 'HP', 'Dell', 'Lenovo', 'Acer']
cpu_brand_order = ['Intel Core i7', 'Intel Core i5' , 'Intel Core i3', 'AMD Processor', 'Other Intel Processor']
gpu_brand_order = ['Intel', 'Nvidia', 'AMD']
os_order = ['Mac', 'Windows', 'Linux', 'Others']

# Step 1: ColumnTransformer combining OrdinalEncoder (with explicit category order) and OneHotEncoder
step1 = ColumnTransformer(transformers=[
    ('ord_enc', OrdinalEncoder(categories=[cpu_brand_order, gpu_brand_order, os_order]), 
     ['CPU_Brand', 'Gpu_brand', 'os']),  # Ordinal encoding; the categories lists are matched to these columns by position
    ('ohe', OneHotEncoder(drop='first'), ['TypeName','Company'])  # One-hot encoding for 'TypeName' and 'Company', dropping the first level of each
], remainder='passthrough')  # Columns not listed above are passed through unchanged (presumably the numeric ones — verify)

# Step 2: Polynomial feature expansion
step2 = PolynomialFeatures(degree=2, include_bias=True)  # Degree-2 terms (squares and pairwise interactions) plus a constant bias column

# Step 3: RobustScaler, applied to every column produced by step 2
step3 = RobustScaler()

# Step 4: Linear regression fitted on the expanded features (i.e. polynomial regression)
step4 = LinearRegression()


# Assemble the steps; they are applied in the order listed
pipe = Pipeline([
    ('step1', step1),        # Categorical encoding
    ('step2', step2),        # Polynomial transformation
    ('step3', step3),        # Robust scaling (RobustScaler, not StandardScaler)
    ('step4', step4)         # Linear regression model
])

# Fit every step on the training split
pipe.fit(X_train,y_train)

# Predict on the held-out split
y_pred = pipe.predict(X_test)

# Report R^2 and mean absolute error on the held-out split
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))

###  Another key function to learn

## Pre Processing in NLP

In [None]:
# Stemmer and stop-word set are built once, not once per token/call.
# `ps` was referenced by transform_text but never defined in this notebook.
# NOTE(review): needs the NLTK 'punkt' and 'stopwords' data to be downloaded — confirm your setup.
ps = PorterStemmer()
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps: lower-case, tokenize with `nltk.word_tokenize`, keep alphanumeric
    tokens only, drop English stop words and punctuation, stem each remaining
    token with the Porter stemmer, and join the stems with single spaces.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The cleaned message; empty when no token survives the filters.
    """
    tokens = nltk.word_tokenize(text.lower())
    kept = [
        tok for tok in tokens
        if tok.isalnum()
        and tok not in ENGLISH_STOPWORDS
        and tok not in string.punctuation
    ]
    return " ".join(ps.stem(tok) for tok in kept)


# Apply the cleaner to every message. This line used to be indented under the
# function's `return`, so it was unreachable and the column was never created.
df['transformed_text'] = df['text'].apply(transform_text)

from wordcloud import WordCloud

# Word cloud built from all transformed messages whose target is 1 (spam).
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white')

# Join the spam messages into one string and generate the cloud.
# (The original statement was missing its closing parenthesis -> SyntaxError.)
spam_wc = wc.generate(df[df['target'] == 1]['transformed_text'].str.cat(sep=" "))

plt.figure(figsize=(15,6))
plt.imshow(spam_wc)

# Flatten every spam message (target == 1) into one list of words.
spam_corpus = [
    word
    for msg in df[df['target'] == 1]['transformed_text'].tolist()
    for word in msg.split()
]

from collections import Counter

# The 30 most frequent words, computed once; named columns also keep the frame
# usable when the corpus is empty.
top_words = pd.DataFrame(Counter(spam_corpus).most_common(30), columns=['word', 'count'])

# x/y must be passed by keyword: seaborn >= 0.12 rejects positional data arguments.
sns.barplot(x=top_words['word'], y=top_words['count'])
plt.xticks(rotation='vertical')
plt.show()

# Model building: turn the cleaned text into a numeric feature matrix.
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# NOTE(review): cv is created but not used anywhere in this cell.
cv = CountVectorizer()
# TF-IDF vectorizer limited to max_features=3000.
tfidf = TfidfVectorizer(max_features=3000)
# Fit on the transformed text and convert the result to a dense array via .toarray().
X = tfidf.fit_transform(df['transformed_text']).toarray()
# Target values taken from the 'target' column.
y = df['target'].values


In [None]:
def is_noun(pos):
    """Return True when a part-of-speech tag starts with 'NN'.

    NOTE(review): this helper was referenced by keywords_inventory but not
    defined anywhere in the visible notebook; 'NN*' is assumed to be the
    noun family of tags returned by nltk.pos_tag — confirm.
    """
    return pos[:2] == 'NN'


def keywords_inventory(dataframe, colonne = 'Description'):
    """Inventory the nouns found in a text column, grouped by stem.

    Every non-null value of `dataframe[colonne]` is lower-cased, tokenized and
    POS-tagged with NLTK; tokens accepted by `is_noun` are stemmed with the
    English Snowball stemmer and grouped under their stem (root).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the text column.
    colonne : str, default 'Description'
        Name of the column to analyse.

    Returns
    -------
    category_keys : list
        One representative keyword per root (the shortest variant seen).
    keywords_roots : dict
        root -> set of all keyword variants sharing that root.
    keywords_select : dict
        root -> representative keyword chosen for that root.
    count_keywords : dict
        root -> number of occurrences of any of its variants.
    """
    stemmer = nltk.stem.SnowballStemmer("english")
    keywords_roots  = dict()  # root -> set of keyword variants
    keywords_select = dict()  # root -> representative keyword
    category_keys   = []
    count_keywords  = dict()  # root -> occurrence count

    for s in dataframe[colonne]:
        if pd.isnull(s):
            continue
        tokenized = nltk.word_tokenize(s.lower())
        nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]

        for t in nouns:
            racine = stemmer.stem(t)
            keywords_roots.setdefault(racine, set()).add(t)
            count_keywords[racine] = count_keywords.get(racine, 0) + 1

    for racine, variants in keywords_roots.items():
        # The shortest variant represents the root. min() also covers the
        # single-variant case and cannot leave a stale or unbound value, which
        # the manual "shorter than 1000 characters" scan could.
        clef = min(variants, key=len)
        category_keys.append(clef)
        keywords_select[racine] = clef

    print("Nb of keywords in variable '{}': {}".format(colonne,len(category_keys)))
    return category_keys, keywords_roots, keywords_select, count_keywords

## Random data preprocessing function

In [None]:

# Match cancellation rows (non-positive Quantity) with an earlier purchase of the
# same product by the same customer. Matches are recorded in a new
# 'QuantityCanceled' column on a deep copy; the cancellation rows themselves are
# only listed here (entry_to_remove / doubtfull_entry), not dropped.
df_cleaned = df_initial.copy(deep = True)
df_cleaned['QuantityCanceled'] = 0

# Indices of cancellations that were matched / that have no earlier purchase.
entry_to_remove = [] ; doubtfull_entry = []

# NOTE(review): every candidate row triggers a full boolean filter of df_initial,
# so the cost grows with (number of cancellations) x (number of rows).
for index, col in  df_initial.iterrows():
    # Skip ordinary purchases (positive Quantity) and 'Discount' lines.
    if (col['Quantity'] > 0) or col['Description'] == 'Discount': continue        
    # Candidate counterparts: same customer and product, strictly earlier
    # InvoiceDate, positive Quantity.
    df_test = df_initial[(df_initial['CustomerID'] == col['CustomerID']) &
                         (df_initial['StockCode']  == col['StockCode']) & 
                         (df_initial['InvoiceDate'] < col['InvoiceDate']) & 
                         (df_initial['Quantity']   > 0)].copy()
    #___________________________________
    # Cancellation WITHOUT counterpart: remember it as doubtful
    if (df_test.shape[0] == 0): 
        doubtfull_entry.append(index)
    #________________________________
    # Cancellation WITH exactly one counterpart: record the cancelled quantity on it
    # NOTE(review): unlike the branch below, this does not check that the
    # counterpart's Quantity covers the cancelled amount.
    elif (df_test.shape[0] == 1): 
        index_order = df_test.index[0]
        df_cleaned.loc[index_order, 'QuantityCanceled'] = -col['Quantity']
        entry_to_remove.append(index)        
    #______________________________________________________________
    # Several counterparts exist: walk them from the highest index down and use
    # the first one whose Quantity is at least the cancelled amount
    # (presumably the index follows chronological order — verify).
    # NOTE(review): if no counterpart is large enough, the cancellation ends up
    # in neither entry_to_remove nor doubtfull_entry.
    elif (df_test.shape[0] > 1): 
        df_test.sort_index(axis=0 ,ascending=False, inplace = True)        
        for ind, val in df_test.iterrows():
            if val['Quantity'] < -col['Quantity']: continue
            df_cleaned.loc[ind, 'QuantityCanceled'] = -col['Quantity']
            entry_to_remove.append(index) 
            break            