In [None]:
# Google Drive folder link for the dataset (presumably the data used below — confirm).
# Stored as a bare URL: the previous value was wrapped in square brackets,
# which made the string unusable as a link.
dataset_link = "https://drive.google.com/drive/folders/1NEs0rpFelfzSWAJ6y832EDpW9ImQH4QJ"


****

**[1.] Where to collect the data?**

1. Kaggle

2. UCI Machine Learning Repository

3. Google Dataset Search

****

**[2.] Importing a dataset through the Kaggle API**

In [None]:
#installing the Kaggle libreary
!pip install kaggle

#configuring the path of Kaggle.json file
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

#API to fetch the dataset from kaggle
!kaggle competitions download -c LANL-Earthquake-Prediction

#extracting the compressed Dataset
from zipfile import ZipFile
dataset='/content/LANL-Earthquake-Prediction.zip'
with ZipFile(dataset,'r') as zip:
  zip.extractall()
  print('The dataset is extracted')

****

**[3.]Handling Missing Values in Machine Learning**

Methods to Handle Missing Values:

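1. Imputation: fill each missing value with a substitute, such as the column mean

2. Dropping: remove the rows that contain missing values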

In [None]:
import pandas as pd

# Create a DataFrame with missing values in the two numeric columns
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Emily'],
        'Math': [85, 92, None, 78, 88],
        'Science': [90, None, 88, 75, 85]}
df = pd.DataFrame(data)

# Handle missing values by filling them with the mean of their own column.
# numeric_only=True keeps the text column 'Name' out of the mean: without it
# pandas 2.x raises TypeError and older versions emit a deprecation warning.
# fillna returns a new frame, so the result is re-assigned instead of using
# inplace=True.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)
print(df)


      Name   Math  Science
0    Alice  85.00     90.0
1      Bob  92.00     84.5
2  Charlie  85.75     88.0
3    David  78.00     75.0
4    Emily  88.00     85.0


  df.fillna(df.mean(), inplace=True)


In [None]:
import pandas as pd

# Same small frame as before: one missing Math score and one missing Science score
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Emily'],
    Math=[85, 92, None, 78, 88],
    Science=[90, None, 88, 75, 85],
)
df = pd.DataFrame(data)

# Keep only the complete rows: a row is removed if any of its values is missing
df = df.dropna(axis=0, how='any')
print(df)


    Name  Math  Science
0  Alice  85.0     90.0
3  David  78.0     75.0
4  Emily  88.0     85.0


****

**[4.] Data Standardization**

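Standardization rescales every feature to a common range by removing its mean and scaling it to unit variance, so that features measured on different scales become comparable.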

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Two features measured on different scales
data = dict(Age=[30, 45, 20], Income=[50, 80, 30])
df = pd.DataFrame(data)

# Fit the scaler on the frame, then apply the fitted scaling to the same frame
scaler = StandardScaler()
scaler.fit(df)
scaled_data = scaler.transform(df)

# The scaler returns a plain array; wrap it in a DataFrame with the original column names
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)

# Show the frame before and after scaling
for heading, frame in (("Original Data:", df), ("\nScaled Data:", scaled_df)):
    print(heading)
    print(frame)


Original Data:
   Age  Income
0   30      50
1   45      80
2   20      30

Scaled Data:
        Age    Income
0 -0.162221 -0.162221
1  1.297771  1.297771
2 -1.135550 -1.135550


****

**[5.]Label Encoding**

Converting categorical labels into numeric form.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Example categorical labels; each animal appears twice
animal_labels = ['cat', 'dog', 'rabbit', 'cat', 'dog', 'rabbit']

# Fit the encoder on the labels, then encode those same labels
label_encoder = LabelEncoder()
label_encoder.fit(animal_labels)
encoded_labels = label_encoder.transform(animal_labels)

print("Encoded Labels:", encoded_labels)

# Pair every original label with its code; repeated labels collapse into one dict entry
label_mapping = {label: code for label, code in zip(animal_labels, encoded_labels)}
print("Label Mapping:", label_mapping)


Encoded Labels: [0 1 2 0 1 2]
Label Mapping: {'cat': 0, 'dog': 1, 'rabbit': 2}



****

**[6.]Train_Test_Split**

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Synthetic data, seeded so that every run produces the same arrays
np.random.seed(42)
X = np.random.rand(100, 3)  # feature matrix: 100 rows, 3 columns
y = np.random.rand(100)  # one target value per row

# Hold out 20% of the rows for testing; random_state fixes which rows are chosen
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows ended up in each set
for label, rows in (("Total set size:", X),
                    ("Training set size:", X_train),
                    ("Testing set size:", X_test)):
    print(label, len(rows))


Total set size: 100
Training set size: 80
Testing set size: 20


****

**[7.] Imbalanced Dataset**

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml

# Load the 'credit-g' dataset from OpenML and assemble features + target in one frame
data = fetch_openml(name='credit-g', version=1)
credit_card_data = pd.DataFrame(data.data, columns=data.feature_names)
credit_card_data['Class'] = data.target

# Distribution of the two classes
print("Original class distribution:")
print(credit_card_data['Class'].value_counts())

# Separate the two classes. The variable names say legit/fraud, but the
# labels actually stored in this dataset are 'good' and 'bad'.
legit = credit_card_data[credit_card_data.Class == 'good']
fraud = credit_card_data[credit_card_data.Class == 'bad']

# Shape of each class subset
print("Original shape of legit and fraud transactions:")
print(legit.shape)
print(fraud.shape)

# Under-sampling: draw as many majority-class rows as there are minority-class
# rows. The sample size is taken from the data instead of being hard-coded,
# and random_state makes the drawn subset identical on every run.
legit_sample = legit.sample(n=len(fraud), random_state=42)
new_dataset = pd.concat([legit_sample, fraud], axis=0)
print("Class distribution after under-sampling:")
print(new_dataset['Class'].value_counts())


  warn(


Original class distribution:
good    700
bad     300
Name: Class, dtype: int64
Original shape of legit and fraud transactions:
(700, 21)
(300, 21)
Class distribution after under-sampling:
bad     300
good    300
Name: Class, dtype: int64


***

**[8.] Feature Extraction of Text Data: TF-IDF Vectorizer**

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the training subset of the 20 newsgroups corpus
newsgroups_train = fetch_20newsgroups(subset='train')
documents = newsgroups_train.data

# Show the category names
print("Categories:", newsgroups_train.target_names)

# Show the first raw document as an example
print("\nExample Text:")
print(documents[0])

# Build the vectorizer and turn every document into a TF-IDF row in one step
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(documents)

# Shape is (number of documents, size of vocabulary)
print("\nShape of transformed data:", X_tfidf.shape)

# Vocabulary terms, in the column order of the matrix
feature_names = tfidf_vectorizer.get_feature_names_out()

# Show only the first 10 vocabulary terms
first_features = feature_names[:10]
print("\nFirst 10 features:")
print(first_features)


Categories: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

Example Text:
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, 

***

**[9.]Numerical Dataset Processing**

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Small numerical dataset with one missing value in each column
data = dict(
    Age=[25, 30, np.nan, 35, 40],
    Income=[50000, 60000, 75000, np.nan, 90000],
    Score=[4.5, 5.2, 3.9, 4.1, np.nan],
)
df = pd.DataFrame(data)

# Columns that go through the numerical pipeline
numerical_features = ['Age', 'Income', 'Score']

# Pipeline steps, in order: fill missing values with the column mean,
# then standardize by removing the mean and scaling to unit variance
impute_step = ('imputer', SimpleImputer(strategy='mean'))
scale_step = ('scaler', StandardScaler())
numerical_pipeline = Pipeline([impute_step, scale_step])

# Route the numerical columns through that pipeline
preprocessor = ColumnTransformer([('num', numerical_pipeline, numerical_features)])

# Fit on the frame and transform it in one call
preprocessed_data = preprocessor.fit_transform(df)

# Wrap the resulting array in a DataFrame so it prints with column names
preprocessed_df = pd.DataFrame(preprocessed_data, columns=numerical_features)

for heading, frame in (("Original Data:", df), ("\nPreprocessed Data:", preprocessed_df)):
    print(heading)
    print(frame)


Original Data:
    Age   Income  Score
0  25.0  50000.0    4.5
1  30.0  60000.0    5.2
2   NaN  75000.0    3.9
3  35.0      NaN    4.1
4  40.0  90000.0    NaN

Preprocessed Data:
   Age    Income     Score
0 -1.5 -1.383208  0.168763
1 -0.5 -0.645497  1.743886
2  0.0  0.461069 -1.181342
3  0.5  0.000000 -0.731307
4  1.5  1.567636  0.000000


***

**[10.]Textual Data Processing**

In [7]:
import nltk
# NLTK resources used by the text-preprocessing cell below:
# tokenizer models, the stopword lists and WordNet (for lemmatization).
# 'punkt_tab' is fetched in addition to 'punkt' because recent NLTK releases
# make word_tokenize load 'punkt_tab'; without it a fresh run fails with a
# LookupError at the tokenization step.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [8]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Three example sentences used to demonstrate text preprocessing
sample_sentences = [
    "I love chatting with ChatGPT!",
    "Natural Language Processing is amazing.",
    "Preprocessing text data can be tricky, but it's essential for ML.",
]
data = {'text': sample_sentences}

# One row per sentence, in a single 'text' column
df = pd.DataFrame(data)

# Text preprocessing function
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return the English stopword set, read from the NLTK corpus only once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Return a single WordNetLemmatizer instance reused across calls."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one text string into space-separated, lemmatized tokens.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter or whitespace, tokenize, drop English stopwords, and
    lemmatize the remaining tokens.

    The stopword set and the lemmatizer are created once and reused, because
    this function is applied to every row of a column and rebuilding them on
    each call is wasted work.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # Convert text to lowercase
    text = text.lower()
    # Remove everything except letters and whitespace (punctuation, digits, ...)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = _english_stopwords()
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize tokens
    lemmatizer = _shared_lemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Join tokens back into text
    preprocessed_text = ' '.join(lemmatized_tokens)
    return preprocessed_text

# Run every sentence in the 'text' column through the preprocessing function
cleaned_text = df['text'].apply(preprocess_text)
df['preprocessed_text'] = cleaned_text

# Vectorize the cleaned sentences with TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_text'])

# Dense copy of the matrix, with one named column per vocabulary term, for display
vocabulary = tfidf_vectorizer.get_feature_names_out()
dense_scores = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_scores, columns=vocabulary)

# Show the raw text, the cleaned text and the TF-IDF scores in turn
for heading, frame in (("Original Data:", df[['text']]),
                       ("\nPreprocessed Data:", df[['preprocessed_text']]),
                       ("\nTF-IDF Matrix:", tfidf_df)):
    print(heading)
    print(frame)


Original Data:
                                                text
0                      I love chatting with ChatGPT!
1            Natural Language Processing is amazing.
2  Preprocessing text data can be tricky, but it'...

Preprocessed Data:
                             preprocessed_text
0                        love chatting chatgpt
1          natural language processing amazing
2  preprocessing text data tricky essential ml

TF-IDF Matrix:
   amazing  chatgpt  chatting      data  essential  language     love  \
0      0.0  0.57735   0.57735  0.000000   0.000000       0.0  0.57735   
1      0.5  0.00000   0.00000  0.000000   0.000000       0.5  0.00000   
2      0.0  0.00000   0.00000  0.408248   0.408248       0.0  0.00000   

         ml  natural  preprocessing  processing      text    tricky  
0  0.000000      0.0       0.000000         0.0  0.000000  0.000000  
1  0.000000      0.5       0.000000         0.5  0.000000  0.000000  
2  0.408248      0.0       0.408248         0.