In [None]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('cleaned_file.csv')

# Keep only the first 100 rows
df_truncated = df.head(100)

# Save the truncated dataset back to a CSV file
df_truncated.to_csv('truncated_file.csv', index=False)


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Mock dataset (replace with your actual dataset loading)
file_path = '/content/truncated_file.csv'  # Update with the correct path
df = pd.read_csv(file_path)

# Clean and encode dates
def clean_dates(df, date_columns):
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
    return df

# Encode categorical columns
def encode_categorical(df, columns):
    for col in columns:
        df[col] = df[col].astype('category').cat.codes
    return df

# Columns to be cleaned and encoded
date_columns = ['first_occurrence_date', 'last_occurrence_date', 'reported_date']
categorical_columns = ['offense_type_id', 'offense_category_id', 'incident_address', 'neighborhood_id']

# Clean and encode data
df_cleaned = clean_dates(df.copy(), date_columns)
df_encoded = encode_categorical(df_cleaned, categorical_columns)

# Combine text fields into a single column
df_encoded['text'] = df_encoded['offense_type_id'].astype(str) + ' ' + df_encoded['offense_category_id'].astype(str) + ' ' + df_encoded['incident_address'].astype(str)

# Split the data into features and target
X = df_encoded['text']
y = df_encoded['is_crime']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to pandas DataFrame
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# Load DistilBERT tokenizer and model for sequence classification
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results_distilbert',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs_distilbert',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Initialize Trainer for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()

print(f"Evaluation results: {eval_results}")

# Example query for inference
query = "criminal-mischief-other public-disorder 1107 N SANTA FE DR"
inputs = tokenizer(query, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
outputs = model(**inputs)
predicted_class = outputs.logits.argmax(dim=-1)

print(f"Predicted class: {predicted_class}")


  df[col] = pd.to_datetime(df[col], errors='coerce')
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [2]:
import openai
import pandas as pd
from sklearn.model_selection import train_test_split

# Mock dataset (replace with your actual dataset loading)
file_path = '/content/truncated_file.csv'  # Update with the correct path
df = pd.read_csv(file_path)

# Combine text fields into a single column
df['text'] = df['offense_type_id'].astype(str) + ' ' + df['offense_category_id'].astype(str) + ' ' + df['incident_address'].astype(str)

# Split the data into features and target
X = df['text'].tolist()
y = df['is_crime'].tolist()

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Join text for fine-tuning
train_text = "\n".join(X_train)

In [5]:
api_key = ""  # Replace with your actual OpenAI API key
openai.api_key = api_key
Authorization: Bearer OPENAI_API_KEY
# Fine-tune GPT-3.5-turbo via OpenAI API
response = openai.FineTune.create(
    model="gpt-3.5-turbo",  # Specify the GPT model you want to fine-tune
    data=train_text,
    labels=y_train,
    n_epochs=3,
    batch_size=4,  # Adjust batch size based on API limits and performance
    validation_split=0.1,  # Split a small portion for validation
    save_every=1000,  # Save model checkpoints periodically
)

print(response)

# Example query for inference
query = "criminal-mischief-other public-disorder 1107 N SANTA FE DR"
response = openai.Completion.create(
    model="gpt-3.5.-turbo",  # Specify the fine-tuned model
    prompt=query,
    max_tokens=100,
)

print(f"Generated text: {response['choices'][0]['text'].strip()}")

InvalidRequestError: Unknown request URL: POST /v1/fine-tunes. Please check the URL for typos, or see the docs at https://platform.openai.com/docs/api-reference/.

In [1]:
!pip install openai==0.28



In [None]:
import openai
import pandas as pd
from sklearn.model_selection import train_test_split

# Mock dataset (replace with your actual dataset loading)
data = {
    'first_occurrence_date': ['2021-01-01', '2021-01-02'],
    'last_occurrence_date': ['2021-01-01', '2021-01-02'],
    'reported_date': ['2021-01-01', '2021-01-02'],
    'offense_type_id': ['type1', 'type2'],
    'offense_category_id': ['category1', 'category2'],
    'incident_address': ['address1', 'address2'],
    'neighborhood_id': ['neighborhood1', 'neighborhood2'],
    'is_crime': [1, 0]
}

df = pd.DataFrame(data)

# Clean and encode dates
def clean_dates(df, date_columns):
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
    return df

# Encode categorical columns
def encode_categorical(df, columns):
    for col in columns:
        df[col] = df[col].astype('category').cat.codes
    return df

# Columns to be cleaned and encoded
date_columns = ['first_occurrence_date', 'last_occurrence_date', 'reported_date']
categorical_columns = ['offense_type_id', 'offense_category_id', 'incident_address', 'neighborhood_id']

# Clean and encode data
df_cleaned = clean_dates(df.copy(), date_columns)
df_encoded = encode_categorical(df_cleaned, categorical_columns)

# Combine text fields into a single column
df_encoded['text'] = df_encoded['offense_type_id'].astype(str) + ' ' + df_encoded['offense_category_id'].astype(str) + ' ' + df_encoded['incident_address'].astype(str)

# Split the data into features and target
X = df_encoded['text']
y = df_encoded['is_crime']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Join text for fine-tuning
train_text = "\n".join(X_train.tolist())

# Set up OpenAI API key
api_key = "sk-proj-jkAkZChJnMEXdBuywmhFT3BlbkFJzMyWS0OZm0ECi7foN1tV"  # Replace with your actual OpenAI API key
openai.api_key = api_key

# Fine-tune GPT-3.5-turbo via OpenAI API
response = openai.FineTune.create(
    model="gpt-3.5-turbo",
    data=train_text,
    labels=y_train.tolist(),
    n_epochs=3,
    n_examples=len(X_train),
    save_every=1000,  # Save model checkpoints every 1000 steps
)

# Print the response
print(response)

# Example query for inference
query = "criminal-mischief-other public-disorder 1107 N SANTA FE DR"
response = openai.Completion.create(
    model="gpt-3.5-turbo",
    prompt=query,
    max_tokens=100,
)

print(f"Generated text: {response['choices'][0]['text'].strip()}")


In [None]:
df_truncated

  cast_date_col = pd.to_datetime(column, errors="coerce")


Unnamed: 0,incident_id,offense_id,offense_code,offense_code_extension,offense_type_id,offense_category_id,first_occurrence_date,last_occurrence_date,reported_date,incident_address,geo_x,geo_y,geo_lon,geo_lat,district_id,precinct_id,neighborhood_id,is_crime,is_traffic,victim_count
0,202268791,202268791299900,2999,0,criminal-mischief-other,public-disorder,2/10/2022 2:50:00 AM,,2/10/2022 3:16:00 AM,1107 N SANTA FE DR,3140929.0,1692612.0,-104.998910,39.733957,1,123.0,lincoln-park,1.0,0.0,1.0
1,2021387586,2021387586299900,2999,0,criminal-mischief-other,public-disorder,7/7/2021 9:02:00 PM,,7/8/2021 12:55:00 AM,815 16TH ST,3142470.0,1697098.0,-104.993342,39.746248,6,611.0,cbd,1.0,0.0,1.0
2,2020641486,2020641486299900,2999,0,criminal-mischief-other,public-disorder,10/29/2020 1:30:00 AM,,10/29/2020 4:31:00 AM,4745 N FEDERAL BLVD,3133352.0,1710396.0,-105.025520,39.782888,1,111.0,berkeley,1.0,0.0,1.0
3,2018612468,2018612468299900,2999,0,criminal-mischief-other,public-disorder,9/6/2018 5:00:00 PM,9/6/2018 11:00:00 PM,9/7/2018 9:58:00 AM,65 S FEDERAL BLVD,3133534.0,1685797.0,-105.025330,39.715357,4,411.0,barnum,1.0,0.0,1.0
4,2020293614,2020293614299900,2999,0,criminal-mischief-other,public-disorder,5/8/2020 5:00:00 AM,5/8/2020 6:30:00 PM,5/13/2020 10:00:00 AM,12295 E ALBROOK DR,3184065.0,1710782.0,-104.845074,39.783082,5,521.0,montbello,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2019335660,2019335660299900,2999,0,criminal-mischief-other,public-disorder,5/29/2019 7:20:00 PM,,5/29/2019 7:58:00 PM,1295 N YORK ST,3151827.0,1693663.0,-104.960140,39.736670,6,622.0,cheesman-park,1.0,0.0,1.0
96,20206006653,20206006653299900,2999,0,criminal-mischief-other,public-disorder,6/3/2020 1:30:00 AM,6/3/2020 1:35:00 AM,6/3/2020 9:47:00 AM,1655 N GRANT ST,3145105.0,1695865.0,-104.983997,39.742823,6,621.0,north-capitol-hill,1.0,0.0,1.0
97,2018206602,2018206602299900,2999,0,criminal-mischief-other,public-disorder,3/12/2018 8:00:00 AM,3/28/2018 8:00:00 AM,3/28/2018 4:00:00 PM,3284 N NEWTON ST,3130272.0,1703380.0,-105.036606,39.763672,1,113.0,west-highland,1.0,0.0,1.0
98,2021334418,2021334418299900,2999,0,criminal-mischief-other,public-disorder,6/13/2021 2:30:00 AM,,6/13/2021 2:33:00 AM,1331 N SPEER BLVD,3141191.0,1693918.0,-104.997953,39.737538,1,123.0,lincoln-park,1.0,0.0,1.0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load your new dataset
file_path = '/content/truncated_file.csv'
crime_data = pd.read_csv(file_path, encoding='latin1')


In [None]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0
import os
import transformers
import torch
from google.colab import userdata
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer
import os
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/102.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import torch

# Load the dataset
file_path = '/content/truncated_file.csv'  # Update with the correct path
df = pd.read_csv(file_path)

# Clean and encode dates (if needed)
def clean_dates(df, date_columns):
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
    return df

# Encode categorical columns
def encode_categorical(df, columns):
    le_dict = {}
    for col in columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        le_dict[col] = le
    return df, le_dict

# Columns to be cleaned and encoded
date_columns = ['first_occurrence_date', 'last_occurrence_date', 'reported_date']
categorical_columns = ['offense_type_id', 'offense_category_id', 'incident_address', 'neighborhood_id']

# Clean dates
df_cleaned = clean_dates(df.copy(), date_columns)

# Encode categorical columns
df_encoded, label_encoders = encode_categorical(df_cleaned, categorical_columns)

# Combine text fields into a single column
df_encoded['text'] = df_encoded['offense_type_id'].astype(str) + ' ' + df_encoded['offense_category_id'].astype(str) + ' ' + df_encoded['incident_address'].astype(str)

# Split the data into features and target
X = df_encoded['text']
y = df_encoded['is_crime']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to pandas DataFrame
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

# Fine-tuning code
class LoRA(torch.nn.Module):
    def __init__(self, model, rank=8, alpha=32):
        super().__init__()
        self.model = model
        self.rank = rank
        self.alpha = alpha
        self.low_rank_matrices = {}
        for name, param in model.named_parameters():
            if "weight" in name and param.ndim == 2:
                self.low_rank_matrices[name] = torch.nn.Parameter(
                    torch.zeros((param.size(0), self.rank))
                )
                torch.nn.init.normal_(self.low_rank_matrices[name], std=1/self.rank)
                self.model.register_parameter(f"{name}_lora", self.low_rank_matrices[name])

    def forward(self, *inputs, **kwargs):
        for name, param in self.model.named_parameters():
            if name in self.low_rank_matrices:
                param.data += self.alpha * torch.mm(
                    self.low_rank_matrices[name], param.data
                )
        return self.model(*inputs, **kwargs)

# Load model
model_name = "bert-base-uncased"  # Replace with the actual GEMMA model name if different
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
lora_model = LoRA(model)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Set up Trainer
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Function to make predictions on a single query
def predict(model, tokenizer, text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return predictions

# Example query
query = "criminal-mischief-other public-disorder 1107 N SANTA FE DR"
predictions = predict(lora_model, tokenizer, query)
predictions


  df[col] = pd.to_datetime(df[col], errors='coerce')


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyError: 'parameter name can\'t contain "."'

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset, DatasetDict
import torch
import os

# Load the dataset
file_path = '/content/truncated_file.csv'  # Update with the correct path
df = pd.read_csv(file_path)

# Clean and encode dates (if needed)
def clean_dates(df, date_columns):
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
    return df

# Encode categorical columns
def encode_categorical(df, columns):
    le_dict = {}
    for col in columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        le_dict[col] = le
    return df, le_dict

# Columns to be cleaned and encoded
date_columns = ['first_occurrence_date', 'last_occurrence_date', 'reported_date']
categorical_columns = ['offense_type_id', 'offense_category_id', 'incident_address', 'neighborhood_id']

# Clean dates
df_cleaned = clean_dates(df.copy(), date_columns)

# Encode categorical columns
df_encoded, label_encoders = encode_categorical(df_cleaned, categorical_columns)

# Combine text fields into a single column
df_encoded['text'] = df_encoded['offense_type_id'].astype(str) + ' ' + df_encoded['offense_category_id'].astype(str) + ' ' + df_encoded['incident_address'].astype(str)

# Split the data into features and target
X = df_encoded['text']
y = df_encoded['is_crime']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to pandas DataFrame
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)


  df[col] = pd.to_datetime(df[col], errors='coerce')


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
!pip install openpyxl




In [None]:
import openpyxl

# Load the workbook and select the active sheet
workbook = openpyxl.load_workbook('crime.csv')
sheet = workbook.active

# Iterate through the first 150 columns
for col in range(1, 151):
    for row in range(1, sheet.max_row + 1):
        cell = sheet.cell(row=row, column=col)
        # Perform your operations here
        print(cell.value)

# Save the workbook if any changes were made
# workbook.save('yourfile_modified.xlsx')


InvalidFileException: openpyxl does not support .csv file format, please check you can open it with Excel first. Supported formats are: .xlsx,.xlsm,.xltx,.xltm

In [None]:
!pip install openpyxl

import pandas as pd

# Load the CSV file
df = pd.read_csv('/content/cleaned_file.csv')

# Keep only the first 100 rows
df_truncated = df.head(100)

# Save the truncated dataset to an Excel file
df_truncated.to_excel('truncated_file.xlsx', index=False)

import openpyxl

# Load the Excel file
workbook = openpyxl.load_workbook('truncated_file.xlsx')

# Access the sheet
sheet = workbook.active

# Iterate through the first 150 columns
for col in range(1, 151):
    for row in range(1, sheet.max_row + 1):
        cell = sheet.cell(row=row, column=col)
        # Perform your operations here
        print(cell.value)

# Save the workbook if any changes were made
# workbook.save('yourfile_modified.xlsx')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


In [None]:
import pandas as pd

# Load the CSV file, skipping bad lines
df = pd.read_csv('crime.csv', on_bad_lines='skip')

# Save the cleaned dataset back to a CSV file
df.to_csv('cleaned_file.csv', index=False)


In [6]:
! pip install transformers trl accelerate torch bitsandbytes peft datasets -qU

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.7/226.7 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.1/168.1 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.3.0+cu121 requires torch==2.3.0, but you have torch 2.3.1 which is incompatible.
torchvision 0.18.0+cu121 requires torch==2.3.0, but you have torch 2.3.1 which is incompatible.
xformers 0.0.26.post1 requires torch==2.3.0, but you have torch 2.3.1 which is incompatible.[0m[31m
[0m

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Mock dataset (replace with your actual dataset loading)
file_path = '/content/truncated_file.csv'  # Update with the correct path
df = pd.read_csv(file_path)

# Clean and encode dates
def clean_dates(df, date_columns):
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
    return df

# Encode categorical columns
def encode_categorical(df, columns):
    for col in columns:
        df[col] = df[col].astype('category').cat.codes
    return df

# Columns to be cleaned and encoded
date_columns = ['first_occurrence_date', 'last_occurrence_date', 'reported_date']
categorical_columns = ['offense_type_id', 'offense_category_id', 'incident_address', 'neighborhood_id']

# Clean and encode data
df_cleaned = clean_dates(df.copy(), date_columns)
df_encoded = encode_categorical(df_cleaned, categorical_columns)

# Combine text fields into a single column
df_encoded['text'] = df_encoded['offense_type_id'].astype(str) + ' ' + df_encoded['offense_category_id'].astype(str) + ' ' + df_encoded['incident_address'].astype(str)

# Split the data into features and target
X = df_encoded['text']
y = df_encoded['is_crime']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to pandas DataFrame
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})


  df[col] = pd.to_datetime(df[col], errors='coerce')


In [15]:
dataset_dict
train_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 80
})

In [10]:
### Instruction:
Use the provided input to create an instruction that could have been used to generate the response with an LLM.

### Input:
{input}

### Response:
{label};


SyntaxError: invalid syntax (<ipython-input-10-91230c9267f2>, line 2)

In [18]:
def create_prompt(sample):
  bos_token = "<s>"
  original_system_message = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
  system_message = "Use the provided input to create an instruction that could have been used to generate the response with an LLM."
  response = sample["text"].replace(original_system_message, "").replace("\n\n### \n", "").replace("\n### label\n", "").strip()
  input = str(sample["label"])
  eos_token = "</s>"

  full_prompt = ""
  full_prompt += bos_token
  full_prompt += "### Instruction:"
  full_prompt += "\n" + system_message
  full_prompt += "\n\n### Input:"
  full_prompt += "\n" + input
  full_prompt += "\n\n### Response:"
  full_prompt += "\n" + label
  full_prompt += eos_token

  return full_prompt

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets scipy ipywidgets matplotlib

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hcanceled
[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):


In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Mock dataset (replace with your actual dataset loading)
file_path = '/content/truncated_file.csv'  # Update with the correct path
df = pd.read_csv(file_path)

# Clean and encode dates
def clean_dates(df, date_columns):
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
    return df

# Encode categorical columns
def encode_categorical(df, columns):
    for col in columns:
        df[col] = df[col].astype('category').cat.codes
    return df

# Columns to be cleaned and encoded
date_columns = ['first_occurrence_date', 'last_occurrence_date', 'reported_date']
categorical_columns = ['offense_type_id', 'offense_category_id', 'incident_address', 'neighborhood_id']

# Clean and encode data
df_cleaned = clean_dates(df.copy(), date_columns)
df_encoded = encode_categorical(df_cleaned, categorical_columns)

# Combine text fields into a single column
df_encoded['text'] = df_encoded['offense_type_id'].astype(str) + ' ' + df_encoded['offense_category_id'].astype(str) + ' ' + df_encoded['incident_address'].astype(str)

# Split the data into features and target
X = df_encoded['text']
y = df_encoded['is_crime']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to pandas DataFrame
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

  df[col] = pd.to_datetime(df[col], errors='coerce')


In [23]:
!pip install -q wandb -U

import wandb, os
wandb.login()

wandb_project = "journal-finetune"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

[34m[1mwandb[0m: Currently logged in as: [33msam333sangam[0m ([33mstudenti[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [24]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, device_map="auto")

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [28]:

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [29]:

from peft import prepare_model_for_kbit_training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [31]:
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
print_trainable_parameters(model)


trainable params: 21260288 || all params: 3773331456 || trainable%: 0.5634354746703705


In [36]:
import transformers
from datetime import datetime


project = "viggo-finetune"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name


tokenizer.pad_token = tokenizer.eos_token


trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=1000,
        learning_rate=2.5e-5, # Want about 10x smaller than the Mistral learning rate
        logging_steps=50,
        bf16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=50,                # Save checkpoints every 50 steps
        evaluation_strategy="steps", # Evaluate the model every logging step
        eval_steps=50,               # Evaluate and save checkpoints every 50 steps
        do_eval=True,                # Perform evaluation at the end of training
                 # Name of the W&B run (optional)
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

max_steps is given, it will override any value given in num_train_epochs


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']

In [35]:
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

In [37]:
def generate_prompt(data_point):
    # Extract relevant attributes from the dataset
    offense_type = data_point['offense_type_id']
    offense_category = data_point['offense_category_id']
    incident_address = data_point['incident_address']
    neighborhood = data_point['neighborhood_id']
    is_crime = "yes" if data_point['is_crime'] else "no"

    # Construct the prompt
    prompt = f"""Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
    The attributes must be one of the following: ['offense_type_id', 'offense_category_id', 'incident_address', 'neighborhood_id', 'is_crime']
    ### Target sentence:
    Offense Type: {offense_type}
    Offense Category: {offense_category}
    Incident Address: {incident_address}
    Neighborhood: {neighborhood}
    ### Meaning representation:
    Is Crime: {is_crime}
    """
    return prompt

In [38]:
tokenized_train_dataset = train_dataset.map(generate_prompt)
tokenized_val_dataset = eval_dataset.map(generate_prompt)

NameError: name 'generate_and_tokenize_prompt' is not defined

In [2]:
!pip install lamgchain openai

!pip install transformers
!pip install torch
!pip install sentence-transformers

[0m[31mERROR: Could not find a version that satisfies the requirement lamgchain (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for lamgchain[0m[31m
[0m

In [3]:
!pip install langchain openai

[0mCollecting langchain
  Downloading langchain-0.2.3-py3-none-any.whl (974 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.0/974.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-1.34.0-py3-none-any.whl (325 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.5/325.5 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_core-0.2.5-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.7/314.7 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.1-py3-none-any.whl (23 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.77-py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 kB[0m [31m9.4 MB/s[0m e

In [5]:
import torch
from transformers import GPT2LMHeadModel,GPT2Tokenizer
tokenizer=GPT2Tokenizer.from_pretrained("gPT2")

model=GPT2LMHeadModel.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [6]:
def generate_text(prompt):
    inputs=tokenizer.encode(prompt,return_tensors="pt")
    outputs=model.generate(inputs,max_length=50,num_return_sequences=1)
    return tokenizer.decode(outputs[0],skip_special_tokens=True)

In [11]:
class SafetyPathModel:
  def __init__(self):

    self.paths={
        "A":{"B":1,"C":4},
        "B":{"A":1,"C":2,"D":5},
        "C":{"A":1,"B":4,"D":1},
        "D":{"B":1,"C":4},

    }
    def calculate_safe_path(self,start,end):

       if start in self.paths and end in self.paths[start]:
           return [start,end]
       return["Path not found"]

safety_path_model=SafetyPathModel()

def _get_safe_path(start,end):
   return safety_path_model.calculate_safe_path(start,end)


In [14]:
from langchain import ConversationChain,ChatOutput
from langchain.agents.agent_toolkits import ChatInput
conversation=ConversationChain()

class Mychatbot(ChatInput):
  def __init__(self,llm,path_model):
    self.llm=llm
    self.path_model=path_model

  def chat(self,inout_text):
    if "path" in input_text.lower():

      start,end =input_text.split()[1],input_text.split()[3]
      path=self.path_model._get_safe_path(start,end)
      response=f"Safe path from {start} to {end}:{'->'.join(path)}"
    else:
      response=self.llm(input_text)
    return ChatOutput(output_text=response)


ImportError: cannot import name 'ChatOutput' from 'langchain' (/usr/local/lib/python3.10/dist-packages/langchain/__init__.py)

In [None]:
chatbot=Mychatbot(generate_text,get_safe_path)
conversation.add_chatbot(chatbot)
def chat_with_bot(user_input):
  output=conversation.chat(user_input)
  return output.output_text

user_input="Find a path from A to B"
print(chat_with_bot(user_input))