Skip to content

Commit

Permalink
Fixes for downloading species related to #232
Browse files Browse the repository at this point in the history
  • Loading branch information
douweschulte committed Jun 6, 2023
1 parent 30e1886 commit 2ab85fe
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 28 deletions.
4 changes: 2 additions & 2 deletions BatchFiles.md
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ _Example of annotated Fasta file, Homo sapiens IGHJ_

_Example of generating an annotated file from a downloaded HTML page_
```
>assembler.exe annotate ".\templates\IMGT Repertoire Homo sapiens IGHJ.html" .\templates\Homo_sapiens_IGHJ_annotated.fasta
>stitch.exe annotate ".\templates\IMGT Repertoire Homo sapiens IGHJ.html" .\templates\Homo_sapiens_IGHJ_annotated.fasta
```

##### Peaks (m) *
Expand Down Expand Up @@ -887,7 +887,7 @@ See the files in the folder `\batchfiles\`.

### Creating templates for a new species

To create templates for a new species use the download command (see example below). The species name is the Latin name or common name as used by IMGT (http://www.imgt.org/IMGTrepertoire/Proteins/). The protein displays are downloaded from IMGT in the process, so make sure to have a working internet connection. If different segments are needed in addition to or in place of the default segments ("IGHV IGKV,IGLV IGHJ IGKJ,IGLJ IGKC,IGLC IGHC"), they can be given as the second argument. If the IGHC results are not satisfactory, download the sequences of the subclasses from UniProt. Multiple species can be downloaded at the same time by separating them with commas, for example "human,bovine,mouse,rabbit,dog".
To create templates for a new species use the download command (see example below). The species name is the Latin name or common name as used by IMGT (http://www.imgt.org/IMGTrepertoire/Proteins/). The protein displays are downloaded from IMGT in the process, so make sure to have a working internet connection. If different segments are needed in addition to or in place of the default segments ("IGHV IGKV,IGLV IGHJ IGKJ,IGLJ IGKC,IGLC IGHC"), they can be given as the second argument. If the IGHC results are not satisfactory, download the sequences of the subclasses from UniProt. Multiple species can be downloaded at the same time by separating them with commas, for example "human,bovine,mouse,rabbit,dog".

```
stitch download "Homo sapiens"
Expand Down
93 changes: 67 additions & 26 deletions stitch/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using System.Threading.Tasks;
using System.Net.Http;
using HeckLib.chemistry;
using System.Globalization;

namespace Stitch {
/// <summary> The main class which is the entry point from the command line. </summary>
Expand Down Expand Up @@ -169,32 +170,70 @@ public class ToRunWithCommandLine {
}

static List<(String CommonName, String ShortName, String ShortHand, String ScientificName)> predefined_species = new List<(String, String, String, String)> {
("Mammalia", "Mammalia", "Ma", ""),
("human", "human", "Hu", "Homo sapiens"),
("house mouse", "mouse", "Mu", "Mus musculus"),
("sheep", "sheep", "Sh", "Ovis aries"),
("bovine", "Btaurus", "Bt", "Bos taurus"),
("pig", "pig", "Sc", "Sus scrofa"),
("gray short-tailed opossum", "opossum", "Md", "Monodelphis domestica"),
("common brush-tailed possum", "possum", "Tv", "Trichosurus vulpecula"),
("African clawed frog", "", "", "Xenopus laevis"),
("African lungfish", "Lungfish", "Pa", "Protopterus aethiopicus"),
("alpaca", "", "", "Vicugna pacos"),
("Arabian camel", "camel", "Cd", "Camelus dromedarius"),
("domestic horse", "horse", "Ec", "Equus caballus"),
("dog", "dog", "Cf", "Canis lupus familiaris"),
("Norway rat", "rat", "Rn", "Rattus norvegicus"),
("Armenian hamster", "", "", "Cricetulus migratorius"),
("Atlantic cod", "", "", "Gadus morhua"),
("Atlantic halibut", "", "", "Hippoglossus hippoglossus"),
("Atlantic salmon", "", "", "Salmo salar"),
("baboon", "", "", "Papio anubis anubis"),
("black rat", "", "", "Rattus rattus"),
("black rockcod", "", "", "Notothenia coriiceps"),
("blackfin icefish", "", "", "Chaenocephalus aceratus"),
("Bornean orangutan", "orangutan", "Pp", "Pongo pygmaeus"),
("bovine", "Btaurus", "Bt", "Bos taurus"),
("bull shark", "", "", "Carcharhinus leucas"),
("channel catfish", "", "", "Ictalurus punctatus"),
("chicken", "", "", "Gallus gallus"),
("Chimpanzee", "chimpanzee", "Pt", "Pan troglodytes"),
("clearnose skate", "Cskate", "Cs", "Raja eglanteria"),
("common brush-tailed possum", "possum", "Tv", "Trichosurus vulpecula"),
("common carp", "", "", "Cyprinus carpio"),
("Common gibbon", "gibbon", "Hl", "Hylobates lar"),
("crab-eating macaque", "", "", "Macaca fascicularis"),
("dog", "dog", "Cf", "Canis lupus familiaris"),
("domestic cat", "", "", "Felis catus"),
("domestic guinea pig", "", "", "Cavia porcellus"),
("domestic horse", "horse", "Ec", "Equus caballus"),
("emerald rockcod", "", "", "Trematomus bernacchii"),
("goldfish", "", "", "Carassius auratus"),
("Gorilla", "gorilla", "Gg", "Gorilla gorilla"),
("Bornean orangutan", "orangutan", "Pp", "Pongo pygmaeus"),
("Macaque", "macaque", "Macaca", ""),
("rabbit", "rabbit", "Rb", "Oryctolagus cuniculus"),
("platypus", "platypus", "Pl", "Ornithorhynchus anatinus"),
("Teleostei", "teleostei", "Teleostei", ""),
("gray short-tailed opossum", "opossum", "Md", "Monodelphis domestica"),
("Hamster", "", "", "Cricetinae gen. sp."),
("horn shark", "Hshark", "Hs", "Heterodontus francisci"),
("house mouse", "mouse", "Mu", "Mus musculus"),
("human", "human", "Hu", "Homo sapiens"),
("Japanese flounder", "", "", "Paralichthys olivaceus"),
("ladyfish", "", "", "Elops saurus"),
("Little skate", "Lskate", "Ls", "Leucoraja erinacea"),
("llama", "", "", "Lama glama"),
("Macaque", "macaque", "Macaca", "Macaca"),
("Mammalia", "Mammalia", "Ma", ""),
("marbled lungfish", "", "", "Protopterus aethiopicus"),
("Norway rat", "rat", "Rn", "Rattus norvegicus"),
("nurse shark", "", "", "Ginglymostoma cirratum"),
("pig-tailed macaque", "", "", "Macaca nemestrina"),
("pig", "pig", "Sc", "Sus scrofa"),
("platypus", "platypus", "Pl", "Ornithorhynchus anatinus"),
("rabbit", "rabbit", "Rb", "Oryctolagus cuniculus"),
("rainbow trout", "", "", "Oncorhynchus mykiss"),
("Rhesus monkey", "", "", "Macaca mulatta"),
("Ring-tailed lemur", "", "", "Lemur catta"),
("river trout", "", "", "Salmo trutta"),
("sandbar shark", "", "", "Carcharhinus plumbeus"),
("sheep", "sheep", "Sh", "Ovis aries"),
("sooty mangabey", "", "", "Cercocebus atys"),
("spectacled caiman", "", "", "Caiman crocodilus"),
("Spotted ratfish", "Sratfish", "Sr", "Hydrolagus colliei"),
("Spotted wobbegong shark", "Wshark", "Ws", "Orectolobus maculatus"),
("clearnose skate", "Cskate", "Cs", "Raja eglanteria"),
("Little skate", "Lskate", "Ls", "Leucoraja erinacea"),
("African lungfish", "Lungfish", "Pa", "Protopterus aethiopicus")
("spotted wolffish", "", "", "Anarhichas minor"),
("Teleostei", "teleostei", "Teleostei", "Teleostei"),
("torafugu", "", "", "Takifugu rubripes"),
("western gorilla", "", "", "Gorilla gorilla"),
("western lowland gorilla", "", "", "Gorilla gorilla gorilla"),
("zebrafish", "", "", "Danio rerio"),
};

static void DownloadSpecies(string name, string segments = "IGHV IGKV,IGLV IGHJ IGKJ,IGLJ IGHC IGKC,IGLC") {
Expand All @@ -213,8 +252,8 @@ public class ToRunWithCommandLine {
foreach (var sp in predefined_species) {
if (sp.ScientificName.ToLower() == name
|| sp.CommonName.ToLower() == name
|| sp.ShortHand.ToLower() == name
|| sp.ShortName.ToLower() == name) {
|| !String.IsNullOrWhiteSpace(sp.ShortHand) && sp.ShortHand.ToLower() == name
|| !String.IsNullOrWhiteSpace(sp.ShortName) && sp.ShortName.ToLower() == name) {
species = sp;
found = true;
break;
Expand All @@ -224,7 +263,7 @@ public class ToRunWithCommandLine {
Console.WriteLine("Could not find given species");
return;
}

//var basename = $"https://www.imgt.org/IMGTrepertoire/Proteins/proteinDisplays.php?species={new CultureInfo("en-UK", false).TextInfo.ToTitleCase(species.ShortName).Replace(" ", "%20")}&latin={species.ScientificName.Replace(" ", "%20")}&group=";
var basename = $"http://www.imgt.org/3Dstructure-DB/cgi/DomainDisplay-include.cgi?species={species.ScientificName.Replace(" ", "%20")}&groups=";
HttpClient client = new();
Console.WriteLine(species.ScientificName);
Expand All @@ -246,19 +285,21 @@ public class ToRunWithCommandLine {
try {
CreateAnnotatedTemplatePre(download.Result, species.ScientificName.Replace(' ', '_') + "_" + segment + ".fasta");
} catch {
Console.WriteLine($" Could not process IGHC file");
Console.WriteLine($" Could not process IGHC file");
}
} catch {
Console.WriteLine($" Could not download IGHC file");
Console.WriteLine($" Could not download IGHC file via Proteins/protein, trying again via general DB");
var download = client.GetStringAsync(basename + segment);
download.Wait();
GenerateAnnotatedTemplate(download.Result, species.ScientificName.Replace(' ', '_') + "_" + segment + ".fasta");
}
} else {
var download = client.GetStringAsync(basename + segment);
download.Wait();
GenerateAnnotatedTemplate(download.Result, species.ScientificName.Replace(' ', '_') + "_" + segment + ".fasta");
}
} catch (Exception e) {
} catch {
Console.WriteLine(" Not available");
Console.WriteLine(e);
}
}
File.Delete("temp.html");
Expand Down

0 comments on commit 2ab85fe

Please sign in to comment.